Date: (Wed) Apr 08, 2015

Introduction:

Data: Source: Training: https://courses.edx.org/c4x/MITx/15.071x_2/asset/ClaimsData.csv.zip
New:
Time period:

Synopsis:

Based on analysis utilizing <> techniques, :

[](.png)

Potential next steps include:

  • Probability handling for multinomials vs. desired binomial outcome
  • ROCR currently supports only evaluation of binary classification tasks (version 1.0.7)
  • extensions toward multiclass classification are scheduled for the next release

  • Skip trControl.method=“cv” for dummy classifier ?
  • Add custom model to caret for a dummy (baseline) classifier (binomial & multinomial) that generates proba/outcomes which mimics the freq distribution of glb_rsp_var values; Right now glb_dmy_glm_mdl always generates most frequent outcome in training data
  • glm_dmy_mdl should use the same method as glm_sel_mdl until custom dummy classifier is implemented

  • Prediction accuracy scatter graph:
  • Change shapes to c(“+”, “x”) instead of dots & triangles
  • Add tiles (raw vs. PCA)
  • Use shiny for drop-down of “important” features
  • Use plot.ly for interactive plots ?

  • Move glb_analytics_diag_plots to mydsutils.R: (+) Easier to debug (-) Too many glb vars used
  • Add print(ggplot.petrinet(glb_analytics_pn) + coord_flip()) at the end of every major chunk
  • Parameterize glb_analytics_pn
  • Move glb_impute_missing_data to mydsutils.R: (-) Too many glb vars used; glb_<>_df reassigned
  • Replicate myrun_mdl_classification features in myrun_mdl_regression
  • Do non-glm methods handle interaction terms ?
  • f-score computation for classifiers should be summation across outcomes (not just the desired one ?)
  • Add accuracy computation to glb_dmy_mdl in predict.data.new chunk
  • Why does splitting fit.data.training.all chunk into separate chunks add an overhead of ~30 secs ? It’s not rbind b/c other chunks have lower elapsed time. Is it the number of plots ?
  • Incorporate code chunks in print_sessionInfo
  • Test against
    • projects in github.com/bdanalytics
    • lectures in jhu-datascience track

Analysis:

# Session setup: clear the workspace and fix the RNG for reproducibility.
# NOTE(review): rm(list=ls()) wipes the global environment — tolerable in a
# knitted report, but discouraged in reusable scripts.
rm(list=ls())
set.seed(12345)
# Keep character columns as character (this predates the R 4.0 default)
options(stringsAsFactors=FALSE)
# Project helper libraries (myimport_data, myplot_*, petrinet, ...);
# paths are user-local — adjust when running elsewhere
source("~/Dropbox/datascience/R/mydsutils.R")
source("~/Dropbox/datascience/R/myplot.R")
source("~/Dropbox/datascience/R/mypetrinet.R")
# Gather all package requirements here
#suppressPackageStartupMessages(require())
#packageVersion("caret")

#require(sos); findFn("pinv", maxPages=2, sortby="MaxScore")

# Analysis control global variables
# Training data: MITx 15.071x ClaimsData (downloaded/cached by myimport_data)
glb_trnng_url <- "https://courses.edx.org/c4x/MITx/15.071x_2/asset/ClaimsData.csv.zip"
# Placeholder: only used when glb_is_separate_newent_dataset is TRUE
glb_newdt_url <- "<newdt_url>"
glb_is_separate_newent_dataset <- FALSE    # or TRUE
# TRUE: carve the "new" (holdout) dataset out of the training dataset
glb_split_entity_newent_datasets <- TRUE   # or FALSE
glb_split_newdata_method <- "sample"          # "condition" or "sample"
# Only used when glb_split_newdata_method == "condition"
glb_split_newdata_condition <- "<col_name> <condition_operator> <value>"    # or NULL
# Fraction of observations routed to the holdout split
glb_split_newdata_size_ratio <- 0.4               # > 0 & < 1
glb_split_sample.seed <- 88               # or any integer
# Cap each split at this many observations (stratified sub-sample below)
glb_max_obs <- 5000 # or NULL

glb_is_regression <- FALSE; glb_is_classification <- TRUE

# Raw response: 2009 cost bucket (integer 1-5 per the data inspection below)
glb_rsp_var_raw <- "bucket2009"

# for classification, the response variable has to be a factor
#   especially for random forests (method="rf")
glb_rsp_var <- "bucket2009.fctr"    # or glb_rsp_var_raw

# caret predict(..., type="prob") needs non-numeric factor levels (it crashes
# on levels like 0/1), so prefix each raw bucket number with "B".
glb_map_rsp_raw_to_var <- function(raw) {
    lbls <- paste0("B", raw)
    factor(lbls)
} # or NULL
#glb_map_rsp_raw_to_var(c(1, 2, 3, 4, 5))

# Inverse of glb_map_rsp_raw_to_var: recover the raw numeric bucket from its
# factor label ("B3" -> 3).
# Fix: the previous body, as.numeric(var), returned the factor's internal
# level CODES, which only matched the raw values while the levels happened to
# be "B1".."B5" in sorted order (e.g. "B10" sorts before "B2" and would have
# silently produced wrong buckets). Parsing the label itself is correct for
# any bucket numbering.
glb_map_rsp_var_to_raw <- function(var) {
    as.numeric(sub("^B", "", as.character(var)))
} # or NULL
#glb_map_rsp_var_to_raw(glb_map_rsp_raw_to_var(c(1, 2, 3, 4, 5)))

# Guard: if the response is renamed/transformed, a raw->var mapping function
# must be supplied. Fix: use scalar && (short-circuit) inside if(), not the
# vectorized & — R >= 4.3 errors on length > 1 conditions with &.
if ((glb_rsp_var != glb_rsp_var_raw) && is.null(glb_map_rsp_raw_to_var))
    stop("glb_map_rsp_raw_to_var function expected")

# Column name that will hold model predictions
glb_rsp_var_out <- paste0(glb_rsp_var, ".prediction")
glb_id_vars <- NULL # or c("<id_var>")

# Columns never offered to models as features (built up by union below):
# ids, the raw response, and the .rnorm benchmark noise column
glb_exclude_vars_as_features <- union(glb_id_vars, c(glb_rsp_var_raw, ".rnorm"))
# List transformed vars  
glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, 
                                      NULL)     # or c("<col_name>")
# List feats that shd be excluded due to known causation by prediction variable
# (reimbursement2009 is contemporaneous with bucket2009 — data leakage)
glb_exclude_vars_as_features <- union(union(glb_exclude_vars_as_features, 
                        paste(glb_rsp_var_out, c("", ".proba"), sep="")),
                                    c("reimbursement2009")) # or NULL

glb_impute_na_data <- FALSE            # or TRUE
glb_mice_complete.seed <- 144               # or any integer

# Model slots: final, selected, and dummy/baseline models (filled in later)
glb_fin_mdl <- glb_sel_mdl <- glb_dmy_mdl <- NULL; 
# Classification
#glb_models_method_vctr <- c("glm", "rpart", "rf")   # Binomials
#glb_models_method_vctr <- c("rpart", "rf")          # Multinomials
glb_models_method_vctr <- c("rpart")          # Multinomials - this exercise

glb_models_lst <- list(); glb_models_df <- data.frame()
# 5x5 asymmetric misclassification penalty matrix for the cost buckets:
# zero on the diagonal; assuming rows index the actual bucket (confirm
# against the confusionMatrix orientation used in glb_loss_smmry*), under-
# predicting the bucket costs twice as much as over-predicting it.
glb_loss_mtrx <- matrix(c(0,1,2,3,4,
                          2,0,1,2,3,
                          4,2,0,1,2,
                          6,4,2,0,1,
                          8,6,4,2,0
                          ), byrow=TRUE, nrow=5)    # or NULL
# caret summaryFunction (legacy version): expected misclassification penalty,
# sum(confusion * glb_loss_mtrx) / n, over a resample `data` (obs/pred cols).
# Builds the confusion table via the project helper mycompute_confusion_df.
# NOTE(review): assumes the helper's first column holds the observed labels
# (dropped via [, -1]) — confirm against mycompute_confusion_df.
glb_loss_smmry_old <- function(data, lev=NULL, model=NULL) {
    confusion_df <- mycompute_confusion_df(data, "obs", "pred")    
    confusion_mtrx <- as.matrix(confusion_df[, -1])
    # Pad with zero columns when some outcome levels were never predicted,
    # so the matrix conforms to the 5x5 glb_loss_mtrx for the product below
    confusion_mtrx <- cbind(confusion_mtrx, 
                              matrix(rep(0, 
        (nrow(confusion_mtrx) - ncol(confusion_mtrx)) * nrow(confusion_mtrx)), 
                                     byrow=TRUE, nrow=nrow(confusion_mtrx)))
    # Elementwise product weights each confusion cell by its penalty
    metric <- sum(confusion_mtrx * glb_loss_mtrx) / nrow(data)
    names(metric) <- "loss.error"
    return(metric)
}
# caret summaryFunction: average misclassification penalty
# sum(confusion * glb_loss_mtrx) / n over the resample `data`.
# NOTE(review): confusionMatrix(data$obs, data$pred) passes obs as caret's
# `data` (prediction) argument and pred as `reference` — transposed relative
# to the usual confusionMatrix(pred, obs) call; the _t variant below
# compensates with t(glb_loss_mtrx). Confirm which orientation matches the
# intended [actual, predicted] layout of glb_loss_mtrx.
glb_loss_smmry <- function(data, lev=NULL, model=NULL) {
    confusion_mtrx <- as.matrix(confusionMatrix(data$obs, data$pred))
    print(confusion_mtrx)
    metric <- sum(confusion_mtrx * glb_loss_mtrx) / nrow(data)
    names(metric) <- "loss.error"
    return(metric)
}
# caret summaryFunction: identical to glb_loss_smmry except that it applies
# the TRANSPOSED penalty matrix, for the case where confusionMatrix()'s
# rows/columns are swapped relative to glb_loss_mtrx's intended layout.
glb_loss_smmry_t <- function(data, lev=NULL, model=NULL) {
    conf_mtrx <- as.matrix(confusionMatrix(data$obs, data$pred))
    print(conf_mtrx)
    avg_loss <- sum(t(glb_loss_mtrx) * conf_mtrx) / nrow(data)
    setNames(avg_loss, "loss.error")
}

# Hyperparameter grids searched by caret: rpart complexity (cp) and
# random-forest mtry (unused while only "rpart" is in the method vector)
glb_tune_models_df <- 
   rbind(
    data.frame(parameter="cp", min=0.00, max=0.04, by=0.01), 
    data.frame(parameter="mtry", min=2, max=4, by=1)
        ) 
# or NULL
glb_n_cv_folds <- 3 # or NULL

# NULL: use the classifier's default class assignment (no custom cutoff)
glb_clf_proba_threshold <- NULL # 0.5

# Baseline prediction model feature(s)
glb_bsl_mdl_var <- c("bucket2008") # or NULL

# Depict process
# Petri-net depiction of the analysis workflow (petrinet / ggplot.petrinet
# come from the project's mypetrinet.R). Transitions name the artifacts
# produced; places name the workflow stages; M0 seeds 3 tokens at "bgn".
# x/y are plot coordinates for the rendered diagram.
glb_analytics_pn <- petrinet(name="glb_analytics_pn",
                        trans_df=data.frame(id=1:6,
    name=c("data.training.all","data.new",
           "model.selected","model.final",
           "data.training.all.prediction","data.new.prediction"),
    x=c(   -5,-5,-15,-25,-25,-35),
    y=c(   -5, 5,  0,  0, -5,  5)
                        ),
                        places_df=data.frame(id=1:4,
    name=c("bgn","fit.data.training.all","predict.data.new","end"),
    x=c(   -0,   -20,                    -30,               -40),
    y=c(    0,     0,                      0,                 0),
    M0=c(   3,     0,                      0,                 0)
                        ),
                        arcs_df=data.frame(
    begin=c("bgn","bgn","bgn",        
            "data.training.all","model.selected","fit.data.training.all",
            "fit.data.training.all","model.final",    
            "data.new","predict.data.new",
            "data.training.all.prediction","data.new.prediction"),
    end  =c("data.training.all","data.new","model.selected",
            "fit.data.training.all","fit.data.training.all","model.final",
            "data.training.all.prediction","predict.data.new",
            "predict.data.new","data.new.prediction",
            "end","end")
                        ))
#print(ggplot.petrinet(glb_analytics_pn))
print(ggplot.petrinet(glb_analytics_pn) + coord_flip())
## Loading required package: grid

# Tracks which workflow artifacts exist so far (consumed by later chunks)
glb_analytics_avl_objs <- NULL

# Per-chunk timing ledger: each chunk appends a row with the cumulative
# elapsed seconds since glb_script_tm was captured here
glb_script_tm <- proc.time()
glb_script_df <- data.frame(chunk_label="import_data", 
                            chunk_step_major=1, chunk_step_minor=0,
                            elapsed=(proc.time() - glb_script_tm)["elapsed"])
print(tail(glb_script_df, 2))
##         chunk_label chunk_step_major chunk_step_minor elapsed
## elapsed import_data                1                0   0.002

Step 1: import data

# Import the training data (project helper myimport_data caches to ./data);
# diagnostics are printed only when the data will not be inspected again
# after the split below
glb_entity_df <- myimport_data(
    url=glb_trnng_url, 
    comment="glb_entity_df", force_header=TRUE,
    print_diagn=(glb_is_separate_newent_dataset | 
                !glb_split_entity_newent_datasets))
## [1] "Reading file ./data/ClaimsData.csv..."
## [1] "dimensions of data in ./data/ClaimsData.csv: 458,005 rows x 16 cols"
if (glb_is_separate_newent_dataset) {
    glb_newent_df <- myimport_data(
        url=glb_newdt_url, 
        comment="glb_newent_df", force_header=TRUE, print_diagn=TRUE)
} else {
    if (!glb_split_entity_newent_datasets) {
        stop("Not implemented yet") 
        # NOTE(review): unreachable — everything after stop() never executes
        glb_newent_df <- glb_entity_df[sample(1:nrow(glb_entity_df),
                                          max(2, nrow(glb_entity_df) / 1000)),]                    
    } else      if (glb_split_newdata_method == "condition") {
            # NOTE(review): parse(text=...) evaluates a user-supplied
            # expression string — fine for a template, but avoid with
            # untrusted input
            glb_newent_df <- do.call("subset", 
                list(glb_entity_df, parse(text=glb_split_newdata_condition)))
            glb_entity_df <- do.call("subset", 
                list(glb_entity_df, parse(text=paste0("!(", 
                                                      glb_split_newdata_condition,
                                                      ")"))))
        } else if (glb_split_newdata_method == "sample") {
                require(caTools)
                
                # Stratified split on the raw response: TRUE -> training,
                # FALSE -> holdout, preserving the bucket distribution
                set.seed(glb_split_sample.seed)
                split <- sample.split(glb_entity_df[, glb_rsp_var_raw], 
                                      SplitRatio=(1-glb_split_newdata_size_ratio))
                glb_newent_df <- glb_entity_df[!split, ] 
                glb_entity_df <- glb_entity_df[split ,]
        } else stop("glb_split_newdata_method should be %in% c('condition', 'sample')")   

    comment(glb_newent_df) <- "glb_newent_df"
    myprint_df(glb_newent_df)
    str(glb_newent_df)

    if (glb_split_entity_newent_datasets) {
        myprint_df(glb_entity_df)
        str(glb_entity_df)        
    }
}         
## Loading required package: caTools
##    age alzheimers arthritis cancer copd depression diabetes heart.failure
## 3   67          0         0      0    0          0        0             0
## 5   67          0         0      0    0          0        0             0
## 6   68          0         0      0    0          0        0             0
## 8   70          0         0      0    0          0        0             0
## 9   67          0         0      0    0          0        0             0
## 10  67          0         0      0    0          0        0             0
##    ihd kidney osteoporosis stroke reimbursement2008 bucket2008
## 3    0      0            0      0                 0          1
## 5    0      0            0      0                 0          1
## 6    0      0            0      0                 0          1
## 8    0      0            0      0                 0          1
## 9    0      0            0      0                 0          1
## 10   0      0            0      0                 0          1
##    reimbursement2009 bucket2009
## 3                  0          1
## 5                  0          1
## 6                  0          1
## 8                  0          1
## 9                  0          1
## 10                 0          1
##        age alzheimers arthritis cancer copd depression diabetes
## 43967   57          0         0      0    0          0        0
## 70246   70          0         0      0    0          0        0
## 165755  78          0         0      0    0          0        0
## 208131  73          0         1      1    0          0        0
## 319113  87          0         0      0    0          1        1
## 446073  72          1         0      1    0          1        1
##        heart.failure ihd kidney osteoporosis stroke reimbursement2008
## 43967              0   0      0            0      0                 0
## 70246              0   0      0            0      0                 0
## 165755             0   0      0            0      0               140
## 208131             0   0      0            1      0              5680
## 319113             0   1      0            0      0              2800
## 446073             1   1      0            1      1             16030
##        bucket2008 reimbursement2009 bucket2009
## 43967           1                 0          1
## 70246           1                 0          1
## 165755          1               720          1
## 208131          2              1250          1
## 319113          1              3330          2
## 446073          3             28000          4
##        age alzheimers arthritis cancer copd depression diabetes
## 457996  60          0         1      0    1          1        1
## 457998  87          0         0      0    1          1        1
## 458001  61          1         0      0    1          1        1
## 458002  90          1         0      0    1          1        1
## 458003  76          0         1      0    1          1        1
## 458005  80          1         0      0    1          1        1
##        heart.failure ihd kidney osteoporosis stroke reimbursement2008
## 457996             1   1      0            1      1             11720
## 457998             1   1      1            0      0             27750
## 458001             1   1      1            1      1             15960
## 458002             1   1      1            0      0             26870
## 458003             1   1      1            1      1             89140
## 458005             1   1      1            0      1             38320
##        bucket2008 reimbursement2009 bucket2009
## 457996          3            142960          5
## 457998          4            148600          5
## 458001          3            154000          5
## 458002          4            155010          5
## 458003          5            155810          5
## 458005          4            189930          5
## 'data.frame':    183202 obs. of  16 variables:
##  $ age              : int  67 67 68 70 67 67 56 48 99 68 ...
##  $ alzheimers       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ arthritis        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ cancer           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ copd             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ depression       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ diabetes         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ heart.failure    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ihd              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ kidney           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ osteoporosis     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ stroke           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ reimbursement2008: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ bucket2008       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ reimbursement2009: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ bucket2009       : int  1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "comment")= chr "glb_newent_df"
##    age alzheimers arthritis cancer copd depression diabetes heart.failure
## 1   85          0         0      0    0          0        0             0
## 2   59          0         0      0    0          0        0             0
## 4   52          0         0      0    0          0        0             0
## 7   75          0         0      0    0          0        0             0
## 11  89          0         0      0    0          0        0             0
## 13  74          0         0      0    0          0        0             0
##    ihd kidney osteoporosis stroke reimbursement2008 bucket2008
## 1    0      0            0      0                 0          1
## 2    0      0            0      0                 0          1
## 4    0      0            0      0                 0          1
## 7    0      0            0      0                 0          1
## 11   0      0            0      0                 0          1
## 13   0      0            0      0                 0          1
##    reimbursement2009 bucket2009
## 1                  0          1
## 2                  0          1
## 4                  0          1
## 7                  0          1
## 11                 0          1
## 13                 0          1
##        age alzheimers arthritis cancer copd depression diabetes
## 138659  69          0         0      0    0          0        0
## 168428  74          1         0      0    0          0        1
## 189703  81          0         0      0    0          0        1
## 225640  78          1         0      0    0          1        0
## 382169  77          1         0      0    1          1        1
## 397881  46          1         0      0    0          0        0
##        heart.failure ihd kidney osteoporosis stroke reimbursement2008
## 138659             0   0      0            0      0                 0
## 168428             0   0      0            1      0               720
## 189703             0   1      0            0      0               690
## 225640             0   0      0            1      0              1540
## 382169             1   1      1            1      1             16400
## 397881             1   1      1            0      0              3700
##        bucket2008 reimbursement2009 bucket2009
## 138659          1               380          1
## 168428          1               750          1
## 189703          1              1020          1
## 225640          1              1490          1
## 382169          3              6620          2
## 397881          2              8470          3
##        age alzheimers arthritis cancer copd depression diabetes
## 457991  76          0         0      0    1          1        1
## 457992  84          0         0      0    1          0        1
## 457997  73          0         0      0    1          1        1
## 457999  83          1         1      0    1          0        1
## 458000  56          0         1      0    1          1        1
## 458004  82          1         0      0    1          0        1
##        heart.failure ihd kidney osteoporosis stroke reimbursement2008
## 457991             1   1      1            1      0             53550
## 457992             1   1      1            0      0              8620
## 457997             1   1      1            1      0             53230
## 457999             1   1      1            1      1             62620
## 458000             1   1      1            1      0             62980
## 458004             1   1      1            1      1             20660
##        bucket2008 reimbursement2009 bucket2009
## 457991          4            131960          5
## 457992          3            133500          5
## 457997          4            147760          5
## 457999          5            148860          5
## 458000          5            151880          5
## 458004          4            158800          5
## 'data.frame':    274803 obs. of  16 variables:
##  $ age              : int  85 59 52 75 89 74 81 86 78 67 ...
##  $ alzheimers       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ arthritis        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ cancer           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ copd             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ depression       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ diabetes         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ heart.failure    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ihd              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ kidney           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ osteoporosis     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ stroke           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ reimbursement2008: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ bucket2008       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ reimbursement2009: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ bucket2009       : int  1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "comment")= chr "glb_entity_df"
# Optionally cap both splits at glb_max_obs rows each, stratified on the raw
# response via caTools::sample.split (a SplitRatio > 1 is interpreted by
# caTools as an absolute observation count — confirm against its docs).
# Fixes: (a) previously this relied on caTools having been attached by the
# earlier "sample"-split branch and would fail under the "condition" method;
# load it explicitly. (b) the `split <-` assignments were embedded inside
# the subscripts, obscuring the logic — unnested here, same evaluation order.
if (!is.null(glb_max_obs)) {
    warning("glb_<ent>_df restricted to glb_max_obs: ", format(glb_max_obs, big.mark=","))
    require(caTools)
    split <- sample.split(glb_entity_df[, glb_rsp_var_raw], SplitRatio=glb_max_obs)
    glb_entity_df <- glb_entity_df[split, ]
    split <- sample.split(glb_newent_df[, glb_rsp_var_raw], SplitRatio=glb_max_obs)
    glb_newent_df <- glb_newent_df[split, ]
}
## Warning: glb_<ent>_df restricted to glb_max_obs: 5,000
# Chunk bookkeeping: append a timing row (cumulative elapsed seconds since
# glb_script_tm) and bump the major step counter for "cleanse_data"
glb_script_df <- rbind(glb_script_df,
                   data.frame(chunk_label="cleanse_data", 
                              chunk_step_major=max(glb_script_df$chunk_step_major)+1, 
                              chunk_step_minor=0,
                              elapsed=(proc.time() - glb_script_tm)["elapsed"]))
print(tail(glb_script_df, 2))
##           chunk_label chunk_step_major chunk_step_minor elapsed
## elapsed   import_data                1                0   0.002
## elapsed1 cleanse_data                2                0   6.270

Step 2: cleanse data

# Chunk bookkeeping: minor step 2.1 (inspect/explore) under the same major
# step; elapsed is cumulative since glb_script_tm
glb_script_df <- rbind(glb_script_df, 
                   data.frame(chunk_label="inspectORexplore.data", 
                              chunk_step_major=max(glb_script_df$chunk_step_major), 
                              chunk_step_minor=1,
                              elapsed=(proc.time() - glb_script_tm)["elapsed"]))
print(tail(glb_script_df, 2))
##                    chunk_label chunk_step_major chunk_step_minor elapsed
## elapsed1          cleanse_data                2                0   6.270
## elapsed2 inspectORexplore.data                2                1   6.301

Step 2.1: inspect/explore data

#print(str(glb_entity_df))
#View(glb_entity_df)

# List info gathered for various columns
# <col_name>:   <description>; <notes>

# Create new features that help diagnostics
#   Create factors of string variables
# Identify character columns (minus known non-features) and convert them to
# factors whose level set is the union of values seen in BOTH splits, so
# predict() on glb_newent_df never encounters an unknown level.
# Fixes: sapply over 1:length(...) with ifelse/"" sentinels and
# class(x) == "character" replaced by vapply(is.character) (robust to
# multi-class objects, type-stable); the level union is computed once per
# variable instead of twice; levels= is passed as a plain character vector
# (wrapping it in as.factor() was redundant — factor() coerces levels anyway).
str_vars <- names(glb_entity_df)[vapply(glb_entity_df, is.character, logical(1))]
if (length(str_vars <- setdiff(str_vars, glb_exclude_vars_as_features)) > 0) {
    warning("Creating factors of string variables:", paste0(str_vars, collapse=", "))
    # The raw string columns themselves are excluded; only the .fctr
    # versions are offered as features
    glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, str_vars)
    for (var in str_vars) {
        fctr_levels <- union(glb_entity_df[, var], glb_newent_df[, var])
        glb_entity_df[, paste0(var, ".fctr")] <- factor(glb_entity_df[, var], fctr_levels)
        glb_newent_df[, paste0(var, ".fctr")] <- factor(glb_newent_df[, var], fctr_levels)
    }
}

#   Convert factors to dummy variables
#   Build splines   require(splines); bsBasis <- bs(training$age, df=3)

# Add diagnostic/engineered features to obs_df. obs_twin_df is the other
# split, passed so factor levels could be unioned across both datasets —
# currently referenced only by the commented-out templates below.
# The only live transformation is a .rnorm column of N(0,1) draws (a noise
# benchmark feature). Side effects: prints summary() and per-column NA counts.
# NOTE(review): mutate() here is plyr::mutate — if dplyr is ever attached
# after plyr (or vice versa) masking may change behavior; confirm load order.
add_new_diag_feats <- function(obs_df, obs_twin_df) {
    require(plyr)
    
    obs_df <- mutate(obs_df,
#         <col_name>.NA=is.na(<col_name>),

#         <col_name>.fctr=factor(<col_name>, 
#                     as.factor(union(obs_df$<col_name>, obs_twin_df$<col_name>))), 
#         <col_name>.fctr=relevel(factor(<col_name>, 
#                     as.factor(union(obs_df$<col_name>, obs_twin_df$<col_name>))),
#                                   "<ref_val>"), 
#         <col2_name>.fctr=relevel(factor(ifelse(<col1_name> == <val>, "<oth_val>", "<ref_val>")), 
#                               as.factor(c("R", "<ref_val>")),
#                               ref="<ref_val>"),

          # This doesn't work - use sapply instead
#         <col_name>.fctr_num=grep(<col_name>, levels(<col_name>.fctr)), 
#         
#         Date.my=as.Date(strptime(Date, "%m/%d/%y %H:%M")),
#         Year=year(Date.my),
#         Month=months(Date.my),
#         Weekday=weekdays(Date.my)

#         <col_name>.log=log(<col.name>),        
#         <col_name>=<table>[as.character(<col2_name>)],
#         <col_name>=as.numeric(<col2_name>),

        .rnorm=rnorm(n=nrow(obs_df))
                        )

    # If levels of a factor are different across obs_df & glb_newent_df; predict.glm fails  
    # Transformations not handled by mutate
#     obs_df$<col_name>.fctr.num <- sapply(1:nrow(obs_df), 
#         function(row_ix) grep(obs_df[row_ix, "<col_name>"],
#                               levels(obs_df[row_ix, "<col_name>.fctr"])))
    
    # Diagnostics: distribution of every column, then NA counts per column
    print(summary(obs_df))
    print(sapply(names(obs_df), function(col) sum(is.na(obs_df[, col]))))
    return(obs_df)
}

# Augment the training split with diagnostic features (adds .rnorm);
# prints summary() and per-column NA counts as a side effect
glb_entity_df <- add_new_diag_feats(glb_entity_df, glb_newent_df)
## Loading required package: plyr
##       age          alzheimers       arthritis         cancer      
##  Min.   : 26.0   Min.   :0.0000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.: 67.0   1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000  
##  Median : 73.0   Median :0.0000   Median :0.000   Median :0.0000  
##  Mean   : 72.2   Mean   :0.1966   Mean   :0.153   Mean   :0.0592  
##  3rd Qu.: 80.0   3rd Qu.:0.0000   3rd Qu.:0.000   3rd Qu.:0.0000  
##  Max.   :100.0   Max.   :1.0000   Max.   :1.000   Max.   :1.0000  
##       copd          depression        diabetes      heart.failure   
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.1342   Mean   :0.2066   Mean   :0.3758   Mean   :0.2862  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       ihd             kidney        osteoporosis        stroke      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.4218   Mean   :0.1624   Mean   :0.1802   Mean   :0.0402  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  reimbursement2008   bucket2008    reimbursement2009   bucket2009   
##  Min.   :     0    Min.   :1.000   Min.   :     0    Min.   :1.000  
##  1st Qu.:     0    1st Qu.:1.000   1st Qu.:   160    1st Qu.:1.000  
##  Median :   955    Median :1.000   Median :  1505    Median :1.000  
##  Mean   :  3937    Mean   :1.433   Mean   :  4310    Mean   :1.522  
##  3rd Qu.:  3080    3rd Qu.:2.000   3rd Qu.:  4180    3rd Qu.:2.000  
##  Max.   :133330    Max.   :5.000   Max.   :131960    Max.   :5.000  
##      .rnorm         
##  Min.   :-3.323274  
##  1st Qu.:-0.709937  
##  Median :-0.015250  
##  Mean   :-0.009294  
##  3rd Qu.: 0.681998  
##  Max.   : 3.675877  
##               age        alzheimers         arthritis            cancer 
##                 0                 0                 0                 0 
##              copd        depression          diabetes     heart.failure 
##                 0                 0                 0                 0 
##               ihd            kidney      osteoporosis            stroke 
##                 0                 0                 0                 0 
## reimbursement2008        bucket2008 reimbursement2009        bucket2009 
##                 0                 0                 0                 0 
##            .rnorm 
##                 0
# Augment the holdout split the same way (adds .rnorm); prints diagnostics
glb_newent_df <- add_new_diag_feats(glb_newent_df, glb_entity_df)
##       age           alzheimers       arthritis          cancer      
##  Min.   : 26.00   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 67.00   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median : 73.00   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   : 72.57   Mean   :0.1904   Mean   :0.1546   Mean   :0.0666  
##  3rd Qu.: 81.00   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :100.00   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       copd          depression        diabetes      heart.failure   
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.1398   Mean   :0.2102   Mean   :0.3794   Mean   :0.2856  
##  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##       ihd             kidney        osteoporosis        stroke     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.000  
##  Mean   :0.4206   Mean   :0.1646   Mean   :0.1718   Mean   :0.048  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.000  
##  reimbursement2008   bucket2008   reimbursement2009   bucket2009   
##  Min.   :     0    Min.   :1.00   Min.   :     0    Min.   :1.000  
##  1st Qu.:     0    1st Qu.:1.00   1st Qu.:   110    1st Qu.:1.000  
##  Median :   945    Median :1.00   Median :  1525    Median :1.000  
##  Mean   :  4044    Mean   :1.44   Mean   :  4255    Mean   :1.522  
##  3rd Qu.:  3130    3rd Qu.:2.00   3rd Qu.:  4170    3rd Qu.:2.000  
##  Max.   :173320    Max.   :5.00   Max.   :103870    Max.   :5.000  
##      .rnorm         
##  Min.   :-3.190467  
##  1st Qu.:-0.697116  
##  Median :-0.004068  
##  Mean   :-0.005770  
##  3rd Qu.: 0.675809  
##  Max.   : 3.411899  
##               age        alzheimers         arthritis            cancer 
##                 0                 0                 0                 0 
##              copd        depression          diabetes     heart.failure 
##                 0                 0                 0                 0 
##               ihd            kidney      osteoporosis            stroke 
##                 0                 0                 0                 0 
## reimbursement2008        bucket2008 reimbursement2009        bucket2009 
##                 0                 0                 0                 0 
##            .rnorm 
##                 0
# Histogram of predictor in glb_entity_df & glb_newent_df
print(myplot_histogram(glb_entity_df, glb_rsp_var_raw))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# Class distribution of the response: heavily imbalanced — ~67% of training
# observations fall in cost bucket 1 (see table below)
if (glb_is_classification)
    print(table(glb_entity_df[, glb_rsp_var_raw]) / nrow(glb_entity_df))
## 
##      1      2      3      4      5 
## 0.6712 0.1902 0.0894 0.0434 0.0058
print(myplot_histogram(glb_newent_df, glb_rsp_var_raw))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# Check for duplicates in glb_id_vars
# (no-op in this run since glb_id_vars is NULL)
if (length(glb_id_vars) > 0) {
    # NOTE(review): id_vars_df is assigned inside the subset() call — kept
    # for later inspection; third positional arg FALSE is drop=FALSE so
    # single-column selections stay data.frames
    id_vars_dups_df <- subset(id_vars_df <- mycreate_tbl_df(
        rbind(glb_entity_df[, glb_id_vars, FALSE], glb_newent_df[, glb_id_vars, FALSE]),
            glb_id_vars), .freq > 1)
    if (nrow(id_vars_dups_df) > 0) {
        warning("Duplicates found in glb_id_vars data:", nrow(id_vars_dups_df))
        myprint_df(id_vars_dups_df)
    } else {
        # glb_id_vars are unique across obs in both glb_<>_df
        glb_exclude_vars_as_features <- union(glb_exclude_vars_as_features, glb_id_vars)
    }
}

#pairs(subset(glb_entity_df, select=-c(col_symbol)))
# Check for glb_newent_df & glb_entity_df features range mismatches

# Other diagnostics:
# print(subset(glb_entity_df, <col1_name> == max(glb_entity_df$<col1_name>, na.rm=TRUE) & 
#                         <col2_name> <= mean(glb_entity_df$<col1_name>, na.rm=TRUE)))

# print(glb_entity_df[which.max(glb_entity_df$<col_name>),])

# print(<col_name>_freq_glb_entity_df <- mycreate_tbl_df(glb_entity_df, "<col_name>"))
# print(which.min(table(glb_entity_df$<col_name>)))
# print(which.max(table(glb_entity_df$<col_name>)))
# print(which.max(table(glb_entity_df$<col1_name>, glb_entity_df$<col2_name>)[, 2]))
# print(table(glb_entity_df$<col1_name>, glb_entity_df$<col2_name>))
# print(table(is.na(glb_entity_df$<col1_name>), glb_entity_df$<col2_name>))
# print(table(sign(glb_entity_df$<col1_name>), glb_entity_df$<col2_name>))
# print(mycreate_xtab(glb_entity_df, <col1_name>))
# print(mycreate_xtab(glb_entity_df, c(<col1_name>, <col2_name>)))
# print(<col1_name>_<col2_name>_xtab_glb_entity_df <- 
#   mycreate_xtab(glb_entity_df, c("<col1_name>", "<col2_name>")))
# <col1_name>_<col2_name>_xtab_glb_entity_df[is.na(<col1_name>_<col2_name>_xtab_glb_entity_df)] <- 0
# print(<col1_name>_<col2_name>_xtab_glb_entity_df <- 
#   mutate(<col1_name>_<col2_name>_xtab_glb_entity_df, 
#             <col3_name>=(<col1_name> * 1.0) / (<col1_name> + <col2_name>))) 

# print(<col2_name>_min_entity_arr <- 
#    sort(tapply(glb_entity_df$<col1_name>, glb_entity_df$<col2_name>, min, na.rm=TRUE)))
# print(<col1_name>_na_by_<col2_name>_arr <- 
#    sort(tapply(glb_entity_df$<col1_name>.NA, glb_entity_df$<col2_name>, mean, na.rm=TRUE)))

# Other plots:
# print(myplot_box(df=glb_entity_df, ycol_names="<col1_name>"))
# print(myplot_box(df=glb_entity_df, ycol_names="<col1_name>", xcol_name="<col2_name>"))
# print(myplot_line(subset(glb_entity_df, Symbol %in% c("KO", "PG")), 
#                   "Date.my", "StockPrice", facet_row_colnames="Symbol") + 
#     geom_vline(xintercept=as.numeric(as.Date("2003-03-01"))) +
#     geom_vline(xintercept=as.numeric(as.Date("1983-01-01")))        
#         )
# print(myplot_scatter(glb_entity_df, "<col1_name>", "<col2_name>", smooth=TRUE))
# print(myplot_scatter(glb_entity_df, "<col1_name>", "<col2_name>", colorcol_name="<Pred.fctr>"))

# Log elapsed time for this chunk: same major step, next minor step
glb_script_df <- rbind(glb_script_df, 
    data.frame(chunk_label="manage_missing_data", 
        chunk_step_major=max(glb_script_df$chunk_step_major), 
        chunk_step_minor=glb_script_df[nrow(glb_script_df), "chunk_step_minor"]+1,
                              elapsed=(proc.time() - glb_script_tm)["elapsed"]))
print(tail(glb_script_df, 2))
##                    chunk_label chunk_step_major chunk_step_minor elapsed
## elapsed2 inspectORexplore.data                2                1   6.301
## elapsed3   manage_missing_data                2                2   7.808

Step 2.2: manage missing data

# print(sapply(names(glb_entity_df), function(col) sum(is.na(glb_entity_df[, col]))))
# print(sapply(names(glb_newent_df), function(col) sum(is.na(glb_newent_df[, col]))))
# glb_entity_df <- na.omit(glb_entity_df)
# glb_newent_df <- na.omit(glb_newent_df)
# df[is.na(df)] <- 0

# Not refactored into mydsutils.R since glb_*_df might be reassigned
#
# Jointly impute missing values in glb_entity_df & glb_newent_df via mice so
# both frames are imputed with a single, common model.
#   entity_df: training entities (glb_entity_df)
#   newent_df: new entities (glb_newent_df)
# Side effects: reassigns glb_entity_df / glb_newent_df in the global env with
# the imputed versions; prints diagnostic summaries along the way.
glb_impute_missing_data <- function(entity_df, newent_df) {
    if (!glb_is_separate_newent_dataset) {
        # Combine entity & newent, tagging each row's origin in .src
        union_df <- rbind(mutate(entity_df, .src = "entity"),
                          mutate(newent_df, .src = "newent"))
        # Impute only candidate features: drop the response & excluded vars
        impute_col_names <- setdiff(setdiff(names(entity_df), 
                                            glb_rsp_var), 
                                    glb_exclude_vars_as_features)
        union_imputed_df <- union_df[, impute_col_names]
        print(summary(union_imputed_df))
    
        library(mice)
        # Seed before mice() so the stochastic imputation is reproducible
        set.seed(glb_mice_complete.seed)
        union_imputed_df <- complete(mice(union_imputed_df))
        print(summary(union_imputed_df))
    
        # Copy the imputed columns back into the combined frame
        union_df[, names(union_imputed_df)] <- union_imputed_df[, names(union_imputed_df)]
        print(summary(union_df))
        
        # Partition again.  Fix: .rownames is never added to union_df (its
        # creation is commented out above), so select=-c(.src, .rownames)
        # would error with "object '.rownames' not found"; drop only .src.
        imp_entity_df <- subset(union_df, .src == "entity", select=-.src)
        # Fix: set comment() BEFORE assigning to the global -- a replacement
        # call on the global's name inside this function would only modify a
        # function-local copy, leaving the global's comment unset.
        comment(imp_entity_df) <- "entity_df"
        glb_entity_df <<- imp_entity_df
        imp_newent_df <- subset(union_df, .src == "newent", select=-.src)
        comment(imp_newent_df) <- "newent_df"
        glb_newent_df <<- imp_newent_df
        
        # Generate summaries of the IMPUTED frames (the per-column NA counts
        # should now all be 0); the originals were already summarized above
        print(summary(imp_entity_df))
        print(sapply(names(imp_entity_df), function(col) sum(is.na(imp_entity_df[, col]))))
        print(summary(imp_newent_df))
        print(sapply(names(imp_newent_df), function(col) sum(is.na(imp_newent_df[, col]))))
    
    } else stop("Not implemented yet")
}

# Run imputation only when either frame actually contains missing values.
# anyNA() is the idiomatic (and faster) equivalent of summing per-column
# is.na() counts; || short-circuits on the scalar results.
if (glb_impute_na_data) {
    if (anyNA(glb_entity_df) || anyNA(glb_newent_df))
        glb_impute_missing_data(glb_entity_df, glb_newent_df)
}    

# Log elapsed time for this chunk: same major step, next minor step
glb_script_df <- rbind(glb_script_df, 
    data.frame(chunk_label="encode_retype_data", 
        chunk_step_major=max(glb_script_df$chunk_step_major), 
        chunk_step_minor=glb_script_df[nrow(glb_script_df), "chunk_step_minor"]+1,
                              elapsed=(proc.time() - glb_script_tm)["elapsed"]))
print(tail(glb_script_df, 2))
##                  chunk_label chunk_step_major chunk_step_minor elapsed
## elapsed3 manage_missing_data                2                2   7.808
## elapsed4  encode_retype_data                2                3   8.228

Step 2.3: encode/retype data

# map_<col_name>_df <- myimport_data(
#     url="<map_url>", 
#     comment="map_<col_name>_df", print_diagn=TRUE)
# map_<col_name>_df <- read.csv(paste0(getwd(), "/data/<file_name>.csv"), strip.white=TRUE)

# glb_entity_df <- mymap_codes(glb_entity_df, "<from_col_name>", "<to_col_name>", 
#     map_<to_col_name>_df, map_join_col_name="<map_join_col_name>", 
#                           map_tgt_col_name="<to_col_name>")
# glb_newent_df <- mymap_codes(glb_newent_df, "<from_col_name>", "<to_col_name>", 
#     map_<to_col_name>_df, map_join_col_name="<map_join_col_name>", 
#                           map_tgt_col_name="<to_col_name>")
                        
# glb_entity_df$<col_name>.fctr <- factor(glb_entity_df$<col_name>, 
#                     as.factor(union(glb_entity_df$<col_name>, glb_newent_df$<col_name>)))
# glb_newent_df$<col_name>.fctr <- factor(glb_newent_df$<col_name>, 
#                     as.factor(union(glb_entity_df$<col_name>, glb_newent_df$<col_name>)))

# When a raw->modeling response mapper is configured, derive the modeling
# response column in both data frames and verify each mapping's frequency
# distribution against the raw column.
if (!is.null(glb_map_rsp_raw_to_var)) {
    glb_entity_df[[glb_rsp_var]] <-
        glb_map_rsp_raw_to_var(glb_entity_df[[glb_rsp_var_raw]])
    mycheck_map_results(mapd_df=glb_entity_df,
                        from_col_name=glb_rsp_var_raw, to_col_name=glb_rsp_var)

    glb_newent_df[[glb_rsp_var]] <-
        glb_map_rsp_raw_to_var(glb_newent_df[[glb_rsp_var_raw]])
    mycheck_map_results(mapd_df=glb_newent_df,
                        from_col_name=glb_rsp_var_raw, to_col_name=glb_rsp_var)
}
## Loading required package: sqldf
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
## Loading required package: tcltk
##   bucket2009 bucket2009.fctr   .n
## 1          1              B1 3356
## 2          2              B2  951
## 3          3              B3  447
## 4          4              B4  217
## 5          5              B5   29

##   bucket2009 bucket2009.fctr   .n
## 1          1              B1 3356
## 2          2              B2  951
## 3          3              B3  447
## 4          4              B4  217
## 5          5              B5   29

# Log elapsed time: new major step (extract_features), minor step reset to 0
glb_script_df <- rbind(glb_script_df, 
                   data.frame(chunk_label="extract_features", 
                              chunk_step_major=max(glb_script_df$chunk_step_major)+1, 
                              chunk_step_minor=0,
                              elapsed=(proc.time() - glb_script_tm)["elapsed"]))
print(tail(glb_script_df, 2))
##                 chunk_label chunk_step_major chunk_step_minor elapsed
## elapsed4 encode_retype_data                2                3   8.228
## elapsed5   extract_features                3                0  10.942

Step 3: extract features

# Create new features that help prediction
# <col_name>.lag.2 <- lag(zoo(glb_entity_df$<col_name>), -2, na.pad=TRUE)
# glb_entity_df[, "<col_name>.lag.2"] <- coredata(<col_name>.lag.2)
# <col_name>.lag.2 <- lag(zoo(glb_newent_df$<col_name>), -2, na.pad=TRUE)
# glb_newent_df[, "<col_name>.lag.2"] <- coredata(<col_name>.lag.2)
# 
# glb_newent_df[1, "<col_name>.lag.2"] <- glb_entity_df[nrow(glb_entity_df) - 1, 
#                                                    "<col_name>"]
# glb_newent_df[2, "<col_name>.lag.2"] <- glb_entity_df[nrow(glb_entity_df), 
#                                                    "<col_name>"]
                                                   
# glb_entity_df <- mutate(glb_entity_df,
#     <new_col_name>=
#                     )

# glb_newent_df <- mutate(glb_newent_df,
#     <new_col_name>=
#                     )

# print(summary(glb_entity_df))
# print(summary(glb_newent_df))

# print(sapply(names(glb_entity_df), function(col) sum(is.na(glb_entity_df[, col]))))
# print(sapply(names(glb_newent_df), function(col) sum(is.na(glb_newent_df[, col]))))

# print(myplot_scatter(glb_entity_df, "<col1_name>", "<col2_name>", smooth=TRUE))

# Replay the analytics petri net: mark data.training.all & data.new as
# available objects and fire the corresponding transitions (workflow tracking)
replay.petrisim(pn=glb_analytics_pn, 
    replay.trans=(glb_analytics_avl_objs <- c(glb_analytics_avl_objs, 
        "data.training.all","data.new")), flip_coord=TRUE)
## time trans    "bgn " "fit.data.training.all " "predict.data.new " "end " 
## 0.0000   multiple enabled transitions:  data.training.all data.new model.selected    firing:  data.training.all 
## 1.0000    1   2 1 0 0 
## 1.0000   multiple enabled transitions:  data.training.all data.new model.selected model.final data.training.all.prediction   firing:  data.new 
## 2.0000    2   1 1 1 0

# Log elapsed time: new major step (select_features), minor step reset to 0
glb_script_df <- rbind(glb_script_df, 
                   data.frame(chunk_label="select_features", 
                              chunk_step_major=max(glb_script_df$chunk_step_major)+1, 
                              chunk_step_minor=0,
                              elapsed=(proc.time() - glb_script_tm)["elapsed"]))
print(tail(glb_script_df, 2))
##               chunk_label chunk_step_major chunk_step_minor elapsed
## elapsed5 extract_features                3                0  10.942
## elapsed6  select_features                4                0  12.299

Step 4: select features

# Rank candidate features by |correlation| with the response
# (excluded vars are filtered out; result is stored in glb_feats_df)
print(glb_feats_df <- 
    myselect_features( entity_df=glb_entity_df, 
                       exclude_vars_as_features=glb_exclude_vars_as_features, 
                       rsp_var=glb_rsp_var))
##                                  id      cor.y  cor.y.abs
## bucket2008               bucket2008 0.46125446 0.46125446
## diabetes                   diabetes 0.40896269 0.40896269
## ihd                             ihd 0.40798592 0.40798592
## reimbursement2008 reimbursement2008 0.40427780 0.40427780
## kidney                       kidney 0.38971036 0.38971036
## heart.failure         heart.failure 0.37320420 0.37320420
## copd                           copd 0.31969314 0.31969314
## depression               depression 0.28668189 0.28668189
## alzheimers               alzheimers 0.27882540 0.27882540
## arthritis                 arthritis 0.25224332 0.25224332
## osteoporosis           osteoporosis 0.21770017 0.21770017
## cancer                       cancer 0.19499896 0.19499896
## stroke                       stroke 0.16603039 0.16603039
## age                             age 0.02529723 0.02529723
# Log elapsed time for this chunk: same major step, next minor step
glb_script_df <- rbind(glb_script_df, 
    data.frame(chunk_label="remove_correlated_features", 
        chunk_step_major=max(glb_script_df$chunk_step_major),
        chunk_step_minor=glb_script_df[nrow(glb_script_df), "chunk_step_minor"]+1,
                              elapsed=(proc.time() - glb_script_tm)["elapsed"]))        
print(tail(glb_script_df, 2))
##                         chunk_label chunk_step_major chunk_step_minor
## elapsed6            select_features                4                0
## elapsed7 remove_correlated_features                4                1
##          elapsed
## elapsed6  12.299
## elapsed7  12.506

Step 4.1: remove correlated features

# Drop highly inter-correlated features: mydelete_cor_features returns the
# surviving set with a cor.low flag (NA = dropped); merge that flag back into
# glb_feats_df (all.x keeps dropped features' rows) and re-sort by cor.y
print(glb_feats_df <- orderBy(~-cor.y, merge(glb_feats_df, 
          mydelete_cor_features(glb_feats_df, glb_entity_df, glb_rsp_var, 
                                glb_exclude_vars_as_features), 
          all.x=TRUE)))
## Loading required package: reshape2
##                   bucket2008   diabetes        ihd reimbursement2008
## bucket2008        1.00000000 0.44339213 0.46050245        0.85839781
## diabetes          0.44339213 1.00000000 0.50290750        0.36792260
## ihd               0.46050245 0.50290750 1.00000000        0.37521673
## reimbursement2008 0.85839781 0.36792260 0.37521673        1.00000000
## kidney            0.55344410 0.42193676 0.37718207        0.49215940
## heart.failure     0.48346348 0.44605794 0.46628922        0.40927692
## copd              0.47983017 0.32326053 0.34570377        0.41333037
## depression        0.34527821 0.35473684 0.32037384        0.29413115
## alzheimers        0.36445940 0.35596309 0.34480425        0.31961378
## arthritis         0.31131326 0.31374750 0.30973675        0.25334749
## osteoporosis      0.24670276 0.27977336 0.28444797        0.19493180
## cancer            0.30809453 0.18856049 0.20274567        0.28160318
## stroke            0.30742836 0.18807697 0.19837708        0.29564870
## age               0.04109362 0.05090513 0.04827641        0.01954279
##                       kidney heart.failure       copd depression
## bucket2008        0.55344410    0.48346348 0.47983017  0.3452782
## diabetes          0.42193676    0.44605794 0.32326053  0.3547368
## ihd               0.37718207    0.46628922 0.34570377  0.3203738
## reimbursement2008 0.49215940    0.40927692 0.41333037  0.2941311
## kidney            1.00000000    0.41464490 0.36435540  0.2735583
## heart.failure     0.41464490    1.00000000 0.38419755  0.3096900
## copd              0.36435540    0.38419755 1.00000000  0.2613975
## depression        0.27355830    0.30968997 0.26139746  1.0000000
## alzheimers        0.31568268    0.30581106 0.23485872  0.2721016
## arthritis         0.23614435    0.25328348 0.21569437  0.2277253
## osteoporosis      0.19565620    0.22465085 0.18942430  0.2028810
## cancer            0.16987423    0.15428262 0.16477714  0.1378292
## stroke            0.21079850    0.21957931 0.19124785  0.1571124
## age               0.02862807    0.05757169 0.03546447  0.0118616
##                   alzheimers  arthritis osteoporosis     cancer     stroke
## bucket2008        0.36445940 0.31131326   0.24670276 0.30809453 0.30742836
## diabetes          0.35596309 0.31374750   0.27977336 0.18856049 0.18807697
## ihd               0.34480425 0.30973675   0.28444797 0.20274567 0.19837708
## reimbursement2008 0.31961378 0.25334749   0.19493180 0.28160318 0.29564870
## kidney            0.31568268 0.23614435   0.19565620 0.16987423 0.21079850
## heart.failure     0.30581106 0.25328348   0.22465085 0.15428262 0.21957931
## copd              0.23485872 0.21569437   0.18942430 0.16477714 0.19124785
## depression        0.27210156 0.22772525   0.20288098 0.13782921 0.15711241
## alzheimers        1.00000000 0.21052940   0.20145419 0.12966182 0.21387945
## arthritis         0.21052940 1.00000000   0.25750741 0.09819606 0.10817740
## osteoporosis      0.20145419 0.25750741   1.00000000 0.10288245 0.12657256
## cancer            0.12966182 0.09819606   0.10288245 1.00000000 0.06946476
## stroke            0.21387945 0.10817740   0.12657256 0.06946476 1.00000000
## age               0.02391479 0.04090936   0.02569856 0.01201320 0.04427714
##                          age
## bucket2008        0.04109362
## diabetes          0.05090513
## ihd               0.04827641
## reimbursement2008 0.01954279
## kidney            0.02862807
## heart.failure     0.05757169
## copd              0.03546447
## depression        0.01186160
## alzheimers        0.02391479
## arthritis         0.04090936
## osteoporosis      0.02569856
## cancer            0.01201320
## stroke            0.04427714
## age               1.00000000
##                   bucket2008   diabetes        ihd reimbursement2008
## bucket2008        0.00000000 0.44339213 0.46050245        0.85839781
## diabetes          0.44339213 0.00000000 0.50290750        0.36792260
## ihd               0.46050245 0.50290750 0.00000000        0.37521673
## reimbursement2008 0.85839781 0.36792260 0.37521673        0.00000000
## kidney            0.55344410 0.42193676 0.37718207        0.49215940
## heart.failure     0.48346348 0.44605794 0.46628922        0.40927692
## copd              0.47983017 0.32326053 0.34570377        0.41333037
## depression        0.34527821 0.35473684 0.32037384        0.29413115
## alzheimers        0.36445940 0.35596309 0.34480425        0.31961378
## arthritis         0.31131326 0.31374750 0.30973675        0.25334749
## osteoporosis      0.24670276 0.27977336 0.28444797        0.19493180
## cancer            0.30809453 0.18856049 0.20274567        0.28160318
## stroke            0.30742836 0.18807697 0.19837708        0.29564870
## age               0.04109362 0.05090513 0.04827641        0.01954279
##                       kidney heart.failure       copd depression
## bucket2008        0.55344410    0.48346348 0.47983017  0.3452782
## diabetes          0.42193676    0.44605794 0.32326053  0.3547368
## ihd               0.37718207    0.46628922 0.34570377  0.3203738
## reimbursement2008 0.49215940    0.40927692 0.41333037  0.2941311
## kidney            0.00000000    0.41464490 0.36435540  0.2735583
## heart.failure     0.41464490    0.00000000 0.38419755  0.3096900
## copd              0.36435540    0.38419755 0.00000000  0.2613975
## depression        0.27355830    0.30968997 0.26139746  0.0000000
## alzheimers        0.31568268    0.30581106 0.23485872  0.2721016
## arthritis         0.23614435    0.25328348 0.21569437  0.2277253
## osteoporosis      0.19565620    0.22465085 0.18942430  0.2028810
## cancer            0.16987423    0.15428262 0.16477714  0.1378292
## stroke            0.21079850    0.21957931 0.19124785  0.1571124
## age               0.02862807    0.05757169 0.03546447  0.0118616
##                   alzheimers  arthritis osteoporosis     cancer     stroke
## bucket2008        0.36445940 0.31131326   0.24670276 0.30809453 0.30742836
## diabetes          0.35596309 0.31374750   0.27977336 0.18856049 0.18807697
## ihd               0.34480425 0.30973675   0.28444797 0.20274567 0.19837708
## reimbursement2008 0.31961378 0.25334749   0.19493180 0.28160318 0.29564870
## kidney            0.31568268 0.23614435   0.19565620 0.16987423 0.21079850
## heart.failure     0.30581106 0.25328348   0.22465085 0.15428262 0.21957931
## copd              0.23485872 0.21569437   0.18942430 0.16477714 0.19124785
## depression        0.27210156 0.22772525   0.20288098 0.13782921 0.15711241
## alzheimers        0.00000000 0.21052940   0.20145419 0.12966182 0.21387945
## arthritis         0.21052940 0.00000000   0.25750741 0.09819606 0.10817740
## osteoporosis      0.20145419 0.25750741   0.00000000 0.10288245 0.12657256
## cancer            0.12966182 0.09819606   0.10288245 0.00000000 0.06946476
## stroke            0.21387945 0.10817740   0.12657256 0.06946476 0.00000000
## age               0.02391479 0.04090936   0.02569856 0.01201320 0.04427714
##                          age
## bucket2008        0.04109362
## diabetes          0.05090513
## ihd               0.04827641
## reimbursement2008 0.01954279
## kidney            0.02862807
## heart.failure     0.05757169
## copd              0.03546447
## depression        0.01186160
## alzheimers        0.02391479
## arthritis         0.04090936
## osteoporosis      0.02569856
## cancer            0.01201320
## stroke            0.04427714
## age               0.00000000
## [1] "cor(bucket2008, reimbursement2008)=0.8584"

## [1] "cor(bucket2009.fctr, bucket2008)=0.4613"
## [1] "cor(bucket2009.fctr, reimbursement2008)=0.4043"
## geom_smooth: Only one unique x value each group.Maybe you want aes(group = 1)?
## geom_smooth: Only one unique x value each group.Maybe you want aes(group = 1)?
## geom_smooth: Only one unique x value each group.Maybe you want aes(group = 1)?
## geom_smooth: Only one unique x value each group.Maybe you want aes(group = 1)?
## Warning in mydelete_cor_features(glb_feats_df, glb_entity_df, glb_rsp_var,
## : Dropping reimbursement2008 as a feature

##                          id      cor.y  cor.y.abs
## bucket2008       bucket2008 0.46125446 0.46125446
## diabetes           diabetes 0.40896269 0.40896269
## ihd                     ihd 0.40798592 0.40798592
## kidney               kidney 0.38971036 0.38971036
## heart.failure heart.failure 0.37320420 0.37320420
## copd                   copd 0.31969314 0.31969314
## depression       depression 0.28668189 0.28668189
## alzheimers       alzheimers 0.27882540 0.27882540
## arthritis         arthritis 0.25224332 0.25224332
## osteoporosis   osteoporosis 0.21770017 0.21770017
## cancer               cancer 0.19499896 0.19499896
## stroke               stroke 0.16603039 0.16603039
## age                     age 0.02529723 0.02529723
##               bucket2008   diabetes        ihd     kidney heart.failure
## bucket2008    1.00000000 0.44339213 0.46050245 0.55344410    0.48346348
## diabetes      0.44339213 1.00000000 0.50290750 0.42193676    0.44605794
## ihd           0.46050245 0.50290750 1.00000000 0.37718207    0.46628922
## kidney        0.55344410 0.42193676 0.37718207 1.00000000    0.41464490
## heart.failure 0.48346348 0.44605794 0.46628922 0.41464490    1.00000000
## copd          0.47983017 0.32326053 0.34570377 0.36435540    0.38419755
## depression    0.34527821 0.35473684 0.32037384 0.27355830    0.30968997
## alzheimers    0.36445940 0.35596309 0.34480425 0.31568268    0.30581106
## arthritis     0.31131326 0.31374750 0.30973675 0.23614435    0.25328348
## osteoporosis  0.24670276 0.27977336 0.28444797 0.19565620    0.22465085
## cancer        0.30809453 0.18856049 0.20274567 0.16987423    0.15428262
## stroke        0.30742836 0.18807697 0.19837708 0.21079850    0.21957931
## age           0.04109362 0.05090513 0.04827641 0.02862807    0.05757169
##                     copd depression alzheimers  arthritis osteoporosis
## bucket2008    0.47983017  0.3452782 0.36445940 0.31131326   0.24670276
## diabetes      0.32326053  0.3547368 0.35596309 0.31374750   0.27977336
## ihd           0.34570377  0.3203738 0.34480425 0.30973675   0.28444797
## kidney        0.36435540  0.2735583 0.31568268 0.23614435   0.19565620
## heart.failure 0.38419755  0.3096900 0.30581106 0.25328348   0.22465085
## copd          1.00000000  0.2613975 0.23485872 0.21569437   0.18942430
## depression    0.26139746  1.0000000 0.27210156 0.22772525   0.20288098
## alzheimers    0.23485872  0.2721016 1.00000000 0.21052940   0.20145419
## arthritis     0.21569437  0.2277253 0.21052940 1.00000000   0.25750741
## osteoporosis  0.18942430  0.2028810 0.20145419 0.25750741   1.00000000
## cancer        0.16477714  0.1378292 0.12966182 0.09819606   0.10288245
## stroke        0.19124785  0.1571124 0.21387945 0.10817740   0.12657256
## age           0.03546447  0.0118616 0.02391479 0.04090936   0.02569856
##                   cancer     stroke        age
## bucket2008    0.30809453 0.30742836 0.04109362
## diabetes      0.18856049 0.18807697 0.05090513
## ihd           0.20274567 0.19837708 0.04827641
## kidney        0.16987423 0.21079850 0.02862807
## heart.failure 0.15428262 0.21957931 0.05757169
## copd          0.16477714 0.19124785 0.03546447
## depression    0.13782921 0.15711241 0.01186160
## alzheimers    0.12966182 0.21387945 0.02391479
## arthritis     0.09819606 0.10817740 0.04090936
## osteoporosis  0.10288245 0.12657256 0.02569856
## cancer        1.00000000 0.06946476 0.01201320
## stroke        0.06946476 1.00000000 0.04427714
## age           0.01201320 0.04427714 1.00000000
##               bucket2008   diabetes        ihd     kidney heart.failure
## bucket2008    0.00000000 0.44339213 0.46050245 0.55344410    0.48346348
## diabetes      0.44339213 0.00000000 0.50290750 0.42193676    0.44605794
## ihd           0.46050245 0.50290750 0.00000000 0.37718207    0.46628922
## kidney        0.55344410 0.42193676 0.37718207 0.00000000    0.41464490
## heart.failure 0.48346348 0.44605794 0.46628922 0.41464490    0.00000000
## copd          0.47983017 0.32326053 0.34570377 0.36435540    0.38419755
## depression    0.34527821 0.35473684 0.32037384 0.27355830    0.30968997
## alzheimers    0.36445940 0.35596309 0.34480425 0.31568268    0.30581106
## arthritis     0.31131326 0.31374750 0.30973675 0.23614435    0.25328348
## osteoporosis  0.24670276 0.27977336 0.28444797 0.19565620    0.22465085
## cancer        0.30809453 0.18856049 0.20274567 0.16987423    0.15428262
## stroke        0.30742836 0.18807697 0.19837708 0.21079850    0.21957931
## age           0.04109362 0.05090513 0.04827641 0.02862807    0.05757169
##                     copd depression alzheimers  arthritis osteoporosis
## bucket2008    0.47983017  0.3452782 0.36445940 0.31131326   0.24670276
## diabetes      0.32326053  0.3547368 0.35596309 0.31374750   0.27977336
## ihd           0.34570377  0.3203738 0.34480425 0.30973675   0.28444797
## kidney        0.36435540  0.2735583 0.31568268 0.23614435   0.19565620
## heart.failure 0.38419755  0.3096900 0.30581106 0.25328348   0.22465085
## copd          0.00000000  0.2613975 0.23485872 0.21569437   0.18942430
## depression    0.26139746  0.0000000 0.27210156 0.22772525   0.20288098
## alzheimers    0.23485872  0.2721016 0.00000000 0.21052940   0.20145419
## arthritis     0.21569437  0.2277253 0.21052940 0.00000000   0.25750741
## osteoporosis  0.18942430  0.2028810 0.20145419 0.25750741   0.00000000
## cancer        0.16477714  0.1378292 0.12966182 0.09819606   0.10288245
## stroke        0.19124785  0.1571124 0.21387945 0.10817740   0.12657256
## age           0.03546447  0.0118616 0.02391479 0.04090936   0.02569856
##                   cancer     stroke        age
## bucket2008    0.30809453 0.30742836 0.04109362
## diabetes      0.18856049 0.18807697 0.05090513
## ihd           0.20274567 0.19837708 0.04827641
## kidney        0.16987423 0.21079850 0.02862807
## heart.failure 0.15428262 0.21957931 0.05757169
## copd          0.16477714 0.19124785 0.03546447
## depression    0.13782921 0.15711241 0.01186160
## alzheimers    0.12966182 0.21387945 0.02391479
## arthritis     0.09819606 0.10817740 0.04090936
## osteoporosis  0.10288245 0.12657256 0.02569856
## cancer        0.00000000 0.06946476 0.01201320
## stroke        0.06946476 0.00000000 0.04427714
## age           0.01201320 0.04427714 0.00000000
##                   id      cor.y  cor.y.abs cor.low
## 4         bucket2008 0.46125446 0.46125446       1
## 8           diabetes 0.40896269 0.40896269       1
## 10               ihd 0.40798592 0.40798592       1
## 13 reimbursement2008 0.40427780 0.40427780      NA
## 11            kidney 0.38971036 0.38971036       1
## 9      heart.failure 0.37320420 0.37320420       1
## 6               copd 0.31969314 0.31969314       1
## 7         depression 0.28668189 0.28668189       1
## 2         alzheimers 0.27882540 0.27882540       1
## 3          arthritis 0.25224332 0.25224332       1
## 12      osteoporosis 0.21770017 0.21770017       1
## 5             cancer 0.19499896 0.19499896       1
## 14            stroke 0.16603039 0.16603039       1
## 1                age 0.02529723 0.02529723       1
# Log elapsed time: new major step (run.models), minor step reset to 0
glb_script_df <- rbind(glb_script_df, 
                   data.frame(chunk_label="run.models", 
                              chunk_step_major=max(glb_script_df$chunk_step_major)+1, 
                              chunk_step_minor=0,
                              elapsed=(proc.time() - glb_script_tm)["elapsed"]))
print(tail(glb_script_df, 2))
##                         chunk_label chunk_step_major chunk_step_minor
## elapsed7 remove_correlated_features                4                1
## elapsed8                 run.models                5                0
##          elapsed
## elapsed7  12.506
## elapsed8  13.903

Step 5: run models

# Feature with the highest cor.y among those that survived the
# correlation filter (cor.low == 1)
max_cor_y_x_var <- subset(glb_feats_df, cor.low == 1)[1, "id"]

# Pick the model-runner for this problem type
if (glb_is_regression) myrun_mdl_fn <- myrun_mdl_lm            # Linear
if (glb_is_classification) myrun_mdl_fn <- myrun_mdl_classification

# Binomial iff the response takes at most two distinct values
glb_is_binomial <- length(unique(glb_entity_df[[glb_rsp_var]])) <= 2
    
# Add dummy model - random variable
# (fix: the condition is a scalar, so use if/else rather than ifelse(),
# which is meant for vectors)
ret_lst <- myrun_mdl_fn(indep_vars_vctr=".rnorm",
                         rsp_var=glb_rsp_var, 
                         rsp_var_out=glb_rsp_var_out,
                        fit_df=glb_entity_df, OOB_df=glb_newent_df,
                        method=if (glb_is_binomial) "glm" else "rpart")
## Loading required package: ROCR
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:stats':
## 
##     lowess
## 
## Loading required package: caret
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:survival':
## 
##     cluster
## 
## Loading required package: rpart
## + Fold1: cp=0.0009124 
## - Fold1: cp=0.0009124 
## + Fold2: cp=0.0009124 
## - Fold2: cp=0.0009124 
## + Fold3: cp=0.0009124 
## - Fold3: cp=0.0009124 
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.00142 on full training set
## Loading required package: rpart.plot

## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7, 
##     cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2, 
##     surrogatestyle = 0, maxdepth = 30, xval = 0))
##   n= 5000 
## 
##            CP nsplit rel error
## 1 0.001419303      0         1
## 
## Node number 1: 5000 observations
##   predicted class=B1  expected loss=0.3288  P(node) =1
##     class counts:  3356   951   447   217    29
##    probabilities: 0.671 0.190 0.089 0.043 0.006 
## 
## n= 5000 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 5000 1644 B1 (0.67 0.19 0.089 0.043 0.0058) *
# Keep the fitted dummy (baseline) model for later accuracy comparisons
glb_dmy_mdl <- ret_lst[["model"]]

# Highest cor.y
# Fit a single-feature model on the best correlated surviving feature
# (fix: scalar condition -> if/else instead of ifelse())
ret_lst <- myrun_mdl_fn(indep_vars_vctr=max_cor_y_x_var,
                         rsp_var=glb_rsp_var, 
                         rsp_var_out=glb_rsp_var_out,
                        fit_df=glb_entity_df, OOB_df=glb_newent_df,
                        method=if (glb_is_binomial) "glm" else "rpart")
## + Fold1: cp=0 
## - Fold1: cp=0 
## + Fold2: cp=0 
## - Fold2: cp=0 
## + Fold3: cp=0 
## - Fold3: cp=0 
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.000811 on full training set

## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7, 
##     cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2, 
##     surrogatestyle = 0, maxdepth = 30, xval = 0))
##   n= 5000 
## 
##           CP nsplit rel error
## 1 0.09549878      0 1.0000000
## 2 0.00081103      1 0.9045012
## 
## Variable importance
## bucket2008 
##        100 
## 
## Node number 1: 5000 observations,    complexity param=0.09549878
##   predicted class=B1  expected loss=0.3288  P(node) =1
##     class counts:  3356   951   447   217    29
##    probabilities: 0.671 0.190 0.089 0.043 0.006 
##   left son=2 (3724 obs) right son=3 (1276 obs)
##   Primary splits:
##       bucket2008 < 1.5 to the left,  improve=375.5983, (0 missing)
## 
## Node number 2: 3724 observations
##   predicted class=B1  expected loss=0.1922664  P(node) =0.7448
##     class counts:  3008   446   183    78     9
##    probabilities: 0.808 0.120 0.049 0.021 0.002 
## 
## Node number 3: 1276 observations
##   predicted class=B2  expected loss=0.604232  P(node) =0.2552
##     class counts:   348   505   264   139    20
##    probabilities: 0.273 0.396 0.207 0.109 0.016 
## 
## n= 5000 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 5000 1644 B1 (0.67 0.19 0.089 0.043 0.0058)  
##   2) bucket2008< 1.5 3724  716 B1 (0.81 0.12 0.049 0.021 0.0024) *
##   3) bucket2008>=1.5 1276  771 B2 (0.27 0.4 0.21 0.11 0.016) *
# Enhance Highest cor.y model with additions of interaction terms that were 
#   dropped due to high correlations
if (nrow(subset(glb_feats_df, is.na(cor.low))) > 0) {
    # Only glm handles interaction terms (checked that rpart does not)
    #   The commented ifelse() below does not work because ifelse() is
    #   vectorized: its result has the same shape as the (length-1) test, so
    #   only the FIRST element of the chosen branch is returned. Scalar
    #   branching needs if/else, as written below.
#     indep_vars_vctr <- ifelse(glb_is_binomial, 
#         c(max_cor_y_x_var, paste(max_cor_y_x_var, 
#                         subset(glb_feats_df, is.na(cor.low))[, "id"], sep=":")),
#         union(max_cor_y_x_var, subset(glb_feats_df, is.na(cor.low))[, "id"]))
    if (glb_is_binomial) {
        # glm: add <max_cor_feat>:<dropped_feat> interaction terms
        indep_vars_vctr <- 
            c(max_cor_y_x_var, paste(max_cor_y_x_var, 
                        subset(glb_feats_df, is.na(cor.low))[, "id"], sep=":"))       
    } else {
        # rpart: no interaction support, so add the dropped features directly
        indep_vars_vctr <- 
            union(max_cor_y_x_var, subset(glb_feats_df, is.na(cor.low))[, "id"])
    }
    ret_lst <- myrun_mdl_fn(indep_vars_vctr,
                        glb_rsp_var, glb_rsp_var_out,
                            fit_df=glb_entity_df, OOB_df=glb_newent_df,
                        method=ifelse(glb_is_binomial, "glm", "rpart"))                        
}    
## + Fold1: cp=0.002839 
## - Fold1: cp=0.002839 
## + Fold2: cp=0.002839 
## - Fold2: cp=0.002839 
## + Fold3: cp=0.002839 
## - Fold3: cp=0.002839 
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.00324 on full training set

## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7, 
##     cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2, 
##     surrogatestyle = 0, maxdepth = 30, xval = 0))
##   n= 5000 
## 
##           CP nsplit rel error
## 1 0.04866180      0 1.0000000
## 2 0.00324412      2 0.9026764
## 
## Variable importance
## reimbursement2008        bucket2008 
##                60                40 
## 
## Node number 1: 5000 observations,    complexity param=0.0486618
##   predicted class=B1  expected loss=0.3288  P(node) =1
##     class counts:  3356   951   447   217    29
##    probabilities: 0.671 0.190 0.089 0.043 0.006 
##   left son=2 (3025 obs) right son=3 (1975 obs)
##   Primary splits:
##       reimbursement2008 < 1545 to the left,  improve=464.1939, (0 missing)
##       bucket2008        < 1.5  to the left,  improve=375.5983, (0 missing)
##   Surrogate splits:
##       bucket2008 < 1.5  to the left,  agree=0.86, adj=0.646, (0 split)
## 
## Node number 2: 3025 observations
##   predicted class=B1  expected loss=0.1209917  P(node) =0.605
##     class counts:  2659   228    98    37     3
##    probabilities: 0.879 0.075 0.032 0.012 0.001 
## 
## Node number 3: 1975 observations,    complexity param=0.0486618
##   predicted class=B2  expected loss=0.6339241  P(node) =0.395
##     class counts:   697   723   349   180    26
##    probabilities: 0.353 0.366 0.177 0.091 0.013 
##   left son=6 (923 obs) right son=7 (1052 obs)
##   Primary splits:
##       reimbursement2008 < 3765 to the left,  improve=38.75805, (0 missing)
##       bucket2008        < 1.5  to the left,  improve=30.80855, (0 missing)
##   Surrogate splits:
##       bucket2008 < 1.5  to the left,  agree=0.887, adj=0.757, (0 split)
## 
## Node number 6: 923 observations
##   predicted class=B1  expected loss=0.5178765  P(node) =0.1846
##     class counts:   445   311   109    50     8
##    probabilities: 0.482 0.337 0.118 0.054 0.009 
## 
## Node number 7: 1052 observations
##   predicted class=B2  expected loss=0.608365  P(node) =0.2104
##     class counts:   252   412   240   130    18
##    probabilities: 0.240 0.392 0.228 0.124 0.017 
## 
## n= 5000 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 5000 1644 B1 (0.67 0.19 0.089 0.043 0.0058)  
##   2) reimbursement2008< 1545 3025  366 B1 (0.88 0.075 0.032 0.012 0.00099) *
##   3) reimbursement2008>=1545 1975 1252 B2 (0.35 0.37 0.18 0.091 0.013)  
##     6) reimbursement2008< 3765 923  478 B1 (0.48 0.34 0.12 0.054 0.0087) *
##     7) reimbursement2008>=3765 1052  640 B2 (0.24 0.39 0.23 0.12 0.017) *
# Low correlated X
# Fit a model on the features that survived the pairwise-correlation pruning
# step (flagged cor.low == 1 in glb_feats_df)
ret_lst <- myrun_mdl_fn(indep_vars_vctr=subset(glb_feats_df, 
                                               cor.low == 1)[, "id"],
                        glb_rsp_var, glb_rsp_var_out,
                        fit_df=glb_entity_df, OOB_df=glb_newent_df,
                        method=ifelse(glb_is_binomial, "glm", "rpart"))                        
## + Fold1: cp=0.002585 
## - Fold1: cp=0.002585 
## + Fold2: cp=0.002585 
## - Fold2: cp=0.002585 
## + Fold3: cp=0.002585 
## - Fold3: cp=0.002585 
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.0176 on full training set

## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7, 
##     cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2, 
##     surrogatestyle = 0, maxdepth = 30, xval = 0))
##   n= 5000 
## 
##           CP nsplit rel error
## 1 0.09549878      0 1.0000000
## 2 0.01763990      1 0.9045012
## 
## Variable importance
##    bucket2008        kidney          copd heart.failure        cancer 
##            51            15            13             9             6 
##     arthritis 
##             6 
## 
## Node number 1: 5000 observations,    complexity param=0.09549878
##   predicted class=B1  expected loss=0.3288  P(node) =1
##     class counts:  3356   951   447   217    29
##    probabilities: 0.671 0.190 0.089 0.043 0.006 
##   left son=2 (3724 obs) right son=3 (1276 obs)
##   Primary splits:
##       bucket2008    < 1.5 to the left,  improve=375.5983, (0 missing)
##       diabetes      < 0.5 to the left,  improve=323.2704, (0 missing)
##       ihd           < 0.5 to the left,  improve=319.3596, (0 missing)
##       heart.failure < 0.5 to the left,  improve=227.0499, (0 missing)
##       kidney        < 0.5 to the left,  improve=213.8953, (0 missing)
##   Surrogate splits:
##       kidney        < 0.5 to the left,  agree=0.821, adj=0.299, (0 split)
##       copd          < 0.5 to the left,  agree=0.807, adj=0.245, (0 split)
##       heart.failure < 0.5 to the left,  agree=0.788, adj=0.168, (0 split)
##       cancer        < 0.5 to the left,  agree=0.774, adj=0.116, (0 split)
##       arthritis     < 0.5 to the left,  agree=0.774, adj=0.114, (0 split)
## 
## Node number 2: 3724 observations
##   predicted class=B1  expected loss=0.1922664  P(node) =0.7448
##     class counts:  3008   446   183    78     9
##    probabilities: 0.808 0.120 0.049 0.021 0.002 
## 
## Node number 3: 1276 observations
##   predicted class=B2  expected loss=0.604232  P(node) =0.2552
##     class counts:   348   505   264   139    20
##    probabilities: 0.273 0.396 0.207 0.109 0.016 
## 
## n= 5000 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 5000 1644 B1 (0.67 0.19 0.089 0.043 0.0058)  
##   2) bucket2008< 1.5 3724  716 B1 (0.81 0.12 0.049 0.021 0.0024) *
##   3) bucket2008>=1.5 1276  771 B2 (0.27 0.4 0.21 0.11 0.016) *
# User specified
# For each caret method requested, fit three variants on all non-excluded
# features: (1) unweighted ("nlm"), (2) loss-matrix weighted ("wlm"),
# (3) transposed-loss-matrix weighted ("tlm"); a plain rpart twin of each is
# also fit at the caret-selected cp for later plotting/inspection.
for (method in glb_models_method_vctr) {
    #print(sprintf("iterating over method:%s", method))

    # All X that is not user excluded
    indep_vars_vctr <- setdiff(names(glb_entity_df), 
        union(glb_rsp_var, glb_exclude_vars_as_features))
    
    # easier to exclude features
#     indep_vars_vctr <- setdiff(names(glb_entity_df), 
#         union(union(glb_rsp_var, glb_exclude_vars_as_features), 
#               c("<feat1_name>", "<feat2_name>")))
    
    # easier to include features
#     indep_vars_vctr <- c("<feat1_name>", "<feat2_name>")

    # User specified bivariate models
#     indep_vars_vctr_lst <- list()
#     for (feat in setdiff(names(glb_entity_df), 
#                          union(glb_rsp_var, glb_exclude_vars_as_features)))
#         indep_vars_vctr_lst[["feat"]] <- feat

    # User specified combinatorial models
#     indep_vars_vctr_lst <- list()
#     combn_mtrx <- combn(c("<feat1_name>", "<feat2_name>", "<featn_name>"), 
#                           <num_feats_to_choose>)
#     for (combn_ix in 1:ncol(combn_mtrx))
#         #print(combn_mtrx[, combn_ix])
#         indep_vars_vctr_lst[[combn_ix]] <- combn_mtrx[, combn_ix]

    # Variant 1: unweighted fit (accuracy-driven tuning)
    set.seed(200)
    ret_lst <- myrun_mdl_fn(indep_vars_vctr=indep_vars_vctr,
                            rsp_var=glb_rsp_var, 
                            rsp_var_out=glb_rsp_var_out,
                            fit_df=glb_entity_df, 
                            OOB_df=glb_newent_df,
                            method=method, 
                            tune_models_df=glb_tune_models_df,
                            n_cv_folds=glb_n_cv_folds)
    glb_sel_nlm_mdl <- ret_lst[["model"]]
    # Refit with plain rpart at the caret-selected cp (for prp/plot support)
    rpart_sel_nlm_mdl <- rpart(reformulate(indep_vars_vctr, response=glb_rsp_var), 
                               data=glb_entity_df, method="class", 
                               control=rpart.control(cp=glb_sel_nlm_mdl$bestTune$cp))

    # Variant 2: loss-matrix weighted fit, tuned to minimize loss.error
    set.seed(201)
    ret_lst <- myrun_mdl_fn(indep_vars_vctr=indep_vars_vctr,
                            rsp_var=glb_rsp_var, 
                            rsp_var_out=glb_rsp_var_out,
                            fit_df=glb_entity_df, 
                            OOB_df=glb_newent_df,
                            method=method, 
                            tune_models_df=glb_tune_models_df,
                            n_cv_folds=glb_n_cv_folds,
                            loss_mtrx=glb_loss_mtrx,
                            summaryFunction=glb_loss_smmry,
                            metric="loss.error",
                            maximize=FALSE)
    glb_sel_wlm_mdl <- ret_lst[["model"]]
    rpart_sel_wlm_mdl <- rpart(reformulate(indep_vars_vctr, response=glb_rsp_var), 
                               data=glb_entity_df, method="class", 
                               control=rpart.control(cp=glb_sel_wlm_mdl$bestTune$cp),
                           parms=list(loss=glb_loss_mtrx))

    # Variant 3: transposed loss matrix
    # NOTE(review): same seed (201) as variant 2 — presumably intentional so
    # both loss-weighted fits see identical CV folds; confirm.
    set.seed(201)
    ret_lst <- myrun_mdl_fn(indep_vars_vctr=indep_vars_vctr,
                            rsp_var=glb_rsp_var, 
                            rsp_var_out=glb_rsp_var_out,
                            fit_df=glb_entity_df, 
                            OOB_df=glb_newent_df,
                            method=method, 
                            tune_models_df=glb_tune_models_df,
                            n_cv_folds=glb_n_cv_folds,
                            loss_mtrx=t(glb_loss_mtrx),
                            summaryFunction=glb_loss_smmry_t,
                            metric="loss.error",
                            maximize=FALSE)
    # NOTE(review): glb_sel_mdl is reassigned on every loop iteration, so it
    # holds the tlm model of the LAST method in glb_models_method_vctr only.
    glb_sel_mdl <- glb_sel_tlm_mdl <- ret_lst[["model"]]
    # NOTE(review): the caret fit above used t(glb_loss_mtrx) but this rpart
    # twin passes the untransposed glb_loss_mtrx — verify whether
    # myrun_mdl_fn transposes internally or this is a mismatch.
    rpart_sel_tlm_mdl <- rpart(reformulate(indep_vars_vctr, response=glb_rsp_var), 
                               data=glb_entity_df, method="class", 
                               control=rpart.control(cp=glb_sel_tlm_mdl$bestTune$cp),
                           parms=list(loss=glb_loss_mtrx))
}
## + Fold1: cp=0 
## - Fold1: cp=0 
## + Fold2: cp=0 
## - Fold2: cp=0 
## + Fold3: cp=0 
## - Fold3: cp=0 
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.01 on full training set

## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7, 
##     cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2, 
##     surrogatestyle = 0, maxdepth = 30, xval = 0))
##   n= 5000 
## 
##           CP nsplit rel error
## 1 0.04866180      0 1.0000000
## 2 0.01094891      2 0.9026764
## 3 0.01000000      3 0.8917275
## 
## Variable importance
## reimbursement2008        bucket2008               ihd          diabetes 
##                31                20                14                13 
##     heart.failure            kidney 
##                12                10 
## 
## Node number 1: 5000 observations,    complexity param=0.0486618
##   predicted class=B1  expected loss=0.3288  P(node) =1
##     class counts:  3356   951   447   217    29
##    probabilities: 0.671 0.190 0.089 0.043 0.006 
##   left son=2 (3025 obs) right son=3 (1975 obs)
##   Primary splits:
##       reimbursement2008 < 1545 to the left,  improve=464.1939, (0 missing)
##       bucket2008        < 1.5  to the left,  improve=375.5983, (0 missing)
##       diabetes          < 0.5  to the left,  improve=323.2704, (0 missing)
##       ihd               < 0.5  to the left,  improve=319.3596, (0 missing)
##       heart.failure     < 0.5  to the left,  improve=227.0499, (0 missing)
##   Surrogate splits:
##       bucket2008    < 1.5  to the left,  agree=0.860, adj=0.646, (0 split)
##       ihd           < 0.5  to the left,  agree=0.790, adj=0.467, (0 split)
##       diabetes      < 0.5  to the left,  agree=0.784, adj=0.454, (0 split)
##       heart.failure < 0.5  to the left,  agree=0.764, adj=0.403, (0 split)
##       kidney        < 0.5  to the left,  agree=0.730, adj=0.316, (0 split)
## 
## Node number 2: 3025 observations
##   predicted class=B1  expected loss=0.1209917  P(node) =0.605
##     class counts:  2659   228    98    37     3
##    probabilities: 0.879 0.075 0.032 0.012 0.001 
## 
## Node number 3: 1975 observations,    complexity param=0.0486618
##   predicted class=B2  expected loss=0.6339241  P(node) =0.395
##     class counts:   697   723   349   180    26
##    probabilities: 0.353 0.366 0.177 0.091 0.013 
##   left son=6 (923 obs) right son=7 (1052 obs)
##   Primary splits:
##       reimbursement2008 < 3765 to the left,  improve=38.75805, (0 missing)
##       bucket2008        < 1.5  to the left,  improve=30.80855, (0 missing)
##       kidney            < 0.5  to the left,  improve=28.52915, (0 missing)
##       ihd               < 0.5  to the left,  improve=27.61120, (0 missing)
##       diabetes          < 0.5  to the left,  improve=23.99424, (0 missing)
##   Surrogate splits:
##       bucket2008    < 1.5  to the left,  agree=0.887, adj=0.757, (0 split)
##       kidney        < 0.5  to the left,  agree=0.660, adj=0.272, (0 split)
##       heart.failure < 0.5  to the left,  agree=0.639, adj=0.228, (0 split)
##       copd          < 0.5  to the left,  agree=0.622, adj=0.191, (0 split)
##       ihd           < 0.5  to the left,  agree=0.607, adj=0.159, (0 split)
## 
## Node number 6: 923 observations,    complexity param=0.01094891
##   predicted class=B1  expected loss=0.5178765  P(node) =0.1846
##     class counts:   445   311   109    50     8
##    probabilities: 0.482 0.337 0.118 0.054 0.009 
##   left son=12 (754 obs) right son=13 (169 obs)
##   Primary splits:
##       kidney            < 0.5  to the left,  improve=7.902130, (0 missing)
##       diabetes          < 0.5  to the left,  improve=6.451528, (0 missing)
##       ihd               < 0.5  to the left,  improve=6.445818, (0 missing)
##       reimbursement2008 < 2545 to the left,  improve=5.585770, (0 missing)
##       arthritis         < 0.5  to the left,  improve=5.444392, (0 missing)
##   Surrogate splits:
##       reimbursement2008 < 3755 to the left,  agree=0.818, adj=0.006, (0 split)
## 
## Node number 7: 1052 observations
##   predicted class=B2  expected loss=0.608365  P(node) =0.2104
##     class counts:   252   412   240   130    18
##    probabilities: 0.240 0.392 0.228 0.124 0.017 
## 
## Node number 12: 754 observations
##   predicted class=B1  expected loss=0.4814324  P(node) =0.1508
##     class counts:   391   239    79    38     7
##    probabilities: 0.519 0.317 0.105 0.050 0.009 
## 
## Node number 13: 169 observations
##   predicted class=B2  expected loss=0.5739645  P(node) =0.0338
##     class counts:    54    72    30    12     1
##    probabilities: 0.320 0.426 0.178 0.071 0.006 
## 
## n= 5000 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 5000 1644 B1 (0.67 0.19 0.089 0.043 0.0058)  
##    2) reimbursement2008< 1545 3025  366 B1 (0.88 0.075 0.032 0.012 0.00099) *
##    3) reimbursement2008>=1545 1975 1252 B2 (0.35 0.37 0.18 0.091 0.013)  
##      6) reimbursement2008< 3765 923  478 B1 (0.48 0.34 0.12 0.054 0.0087)  
##       12) kidney< 0.5 754  363 B1 (0.52 0.32 0.1 0.05 0.0093) *
##       13) kidney>=0.5 169   97 B2 (0.32 0.43 0.18 0.071 0.0059) *
##      7) reimbursement2008>=3765 1052  640 B2 (0.24 0.39 0.23 0.12 0.017) *
##    B1 B2 B3 B4 B5
## B1  7  1  1  0  0
## B2  0  0  0  0  0
## B3  0  1  0  0  0
## B4  0  0  0  0  0
## B5  0  0  0  0  0
## + Fold1: cp=0 
##     B1  B2 B3 B4 B5
## B1 966 125 23  5  0
## B2 167 112 32  6  0
## B3  54  61 23 11  0
## B4  27  27 17  2  0
## B5   2   5  1  1  0
##      B1  B2 B3 B4 B5
## B1 1014 105  0  0  0
## B2  170 147  0  0  0
## B3   59  90  0  0  0
## B4   28  45  0  0  0
## B5    1   8  0  0  0
##      B1  B2 B3 B4 B5
## B1 1034  85  0  0  0
## B2  187 130  0  0  0
## B3   67  82  0  0  0
## B4   32  41  0  0  0
## B5    2   7  0  0  0
##      B1  B2 B3 B4 B5
## B1 1034  85  0  0  0
## B2  187 130  0  0  0
## B3   67  82  0  0  0
## B4   32  41  0  0  0
## B5    2   7  0  0  0
##      B1  B2 B3 B4 B5
## B1 1034  85  0  0  0
## B2  187 130  0  0  0
## B3   67  82  0  0  0
## B4   32  41  0  0  0
## B5    2   7  0  0  0
## - Fold1: cp=0 
## + Fold2: cp=0 
##     B1  B2 B3 B4 B5
## B1 975 115 27  2  0
## B2 157  98 54  8  0
## B3  63  56 28  2  0
## B4  29  21 18  4  0
## B5   4   3  2  1  0
##      B1  B2 B3 B4 B5
## B1 1010 109  0  0  0
## B2  141 176  0  0  0
## B3   62  87  0  0  0
## B4   28  44  0  0  0
## B5    4   6  0  0  0
##      B1  B2 B3 B4 B5
## B1 1034  85  0  0  0
## B2  179 138  0  0  0
## B3   74  75  0  0  0
## B4   32  40  0  0  0
## B5    5   5  0  0  0
##     B1  B2 B3 B4 B5
## B1 981 138  0  0  0
## B2 132 185  0  0  0
## B3  54  95  0  0  0
## B4  23  49  0  0  0
## B5   4   6  0  0  0
##      B1 B2 B3 B4 B5
## B1 1119  0  0  0  0
## B2  317  0  0  0  0
## B3  149  0  0  0  0
## B4   72  0  0  0  0
## B5   10  0  0  0  0
## - Fold2: cp=0 
## + Fold3: cp=0 
##     B1 B2 B3 B4 B5
## B1 998 86 23 11  0
## B2 170 99 36 12  0
## B3  78 48 15  8  0
## B4  29 24 13  6  0
## B5   2  3  0  5  0
##      B1  B2 B3 B4 B5
## B1 1036  82  0  0  0
## B2  193 124  0  0  0
## B3   74  75  0  0  0
## B4   25  47  0  0  0
## B5    4   6  0  0  0
##      B1  B2 B3 B4 B5
## B1 1036  82  0  0  0
## B2  193 124  0  0  0
## B3   74  75  0  0  0
## B4   25  47  0  0  0
## B5    4   6  0  0  0
##      B1  B2 B3 B4 B5
## B1 1036  82  0  0  0
## B2  193 124  0  0  0
## B3   74  75  0  0  0
## B4   25  47  0  0  0
## B5    4   6  0  0  0
##      B1  B2 B3 B4 B5
## B1 1036  82  0  0  0
## B2  193 124  0  0  0
## B3   74  75  0  0  0
## B4   25  47  0  0  0
## B5    4   6  0  0  0
## - Fold3: cp=0 
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.01 on full training set

## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7, 
##     cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2, 
##     surrogatestyle = 0, maxdepth = 30, xval = 0))
##   n= 5000 
## 
##           CP nsplit rel error
## 1 0.04866180      0 1.0000000
## 2 0.01094891      2 0.9026764
## 3 0.01000000      3 0.8917275
## 
## Variable importance
## reimbursement2008        bucket2008               ihd          diabetes 
##                31                20                14                13 
##     heart.failure            kidney 
##                12                10 
## 
## Node number 1: 5000 observations,    complexity param=0.0486618
##   predicted class=B1  expected loss=0.3288  P(node) =1
##     class counts:  3356   951   447   217    29
##    probabilities: 0.671 0.190 0.089 0.043 0.006 
##   left son=2 (3025 obs) right son=3 (1975 obs)
##   Primary splits:
##       reimbursement2008 < 1545 to the left,  improve=464.1939, (0 missing)
##       bucket2008        < 1.5  to the left,  improve=375.5983, (0 missing)
##       diabetes          < 0.5  to the left,  improve=323.2704, (0 missing)
##       ihd               < 0.5  to the left,  improve=319.3596, (0 missing)
##       heart.failure     < 0.5  to the left,  improve=227.0499, (0 missing)
##   Surrogate splits:
##       bucket2008    < 1.5  to the left,  agree=0.860, adj=0.646, (0 split)
##       ihd           < 0.5  to the left,  agree=0.790, adj=0.467, (0 split)
##       diabetes      < 0.5  to the left,  agree=0.784, adj=0.454, (0 split)
##       heart.failure < 0.5  to the left,  agree=0.764, adj=0.403, (0 split)
##       kidney        < 0.5  to the left,  agree=0.730, adj=0.316, (0 split)
## 
## Node number 2: 3025 observations
##   predicted class=B1  expected loss=0.1209917  P(node) =0.605
##     class counts:  2659   228    98    37     3
##    probabilities: 0.879 0.075 0.032 0.012 0.001 
## 
## Node number 3: 1975 observations,    complexity param=0.0486618
##   predicted class=B2  expected loss=0.6339241  P(node) =0.395
##     class counts:   697   723   349   180    26
##    probabilities: 0.353 0.366 0.177 0.091 0.013 
##   left son=6 (923 obs) right son=7 (1052 obs)
##   Primary splits:
##       reimbursement2008 < 3765 to the left,  improve=38.75805, (0 missing)
##       bucket2008        < 1.5  to the left,  improve=30.80855, (0 missing)
##       kidney            < 0.5  to the left,  improve=28.52915, (0 missing)
##       ihd               < 0.5  to the left,  improve=27.61120, (0 missing)
##       diabetes          < 0.5  to the left,  improve=23.99424, (0 missing)
##   Surrogate splits:
##       bucket2008    < 1.5  to the left,  agree=0.887, adj=0.757, (0 split)
##       kidney        < 0.5  to the left,  agree=0.660, adj=0.272, (0 split)
##       heart.failure < 0.5  to the left,  agree=0.639, adj=0.228, (0 split)
##       copd          < 0.5  to the left,  agree=0.622, adj=0.191, (0 split)
##       ihd           < 0.5  to the left,  agree=0.607, adj=0.159, (0 split)
## 
## Node number 6: 923 observations,    complexity param=0.01094891
##   predicted class=B1  expected loss=0.5178765  P(node) =0.1846
##     class counts:   445   311   109    50     8
##    probabilities: 0.482 0.337 0.118 0.054 0.009 
##   left son=12 (754 obs) right son=13 (169 obs)
##   Primary splits:
##       kidney            < 0.5  to the left,  improve=7.902130, (0 missing)
##       diabetes          < 0.5  to the left,  improve=6.451528, (0 missing)
##       ihd               < 0.5  to the left,  improve=6.445818, (0 missing)
##       reimbursement2008 < 2545 to the left,  improve=5.585770, (0 missing)
##       arthritis         < 0.5  to the left,  improve=5.444392, (0 missing)
##   Surrogate splits:
##       reimbursement2008 < 3755 to the left,  agree=0.818, adj=0.006, (0 split)
## 
## Node number 7: 1052 observations
##   predicted class=B2  expected loss=0.608365  P(node) =0.2104
##     class counts:   252   412   240   130    18
##    probabilities: 0.240 0.392 0.228 0.124 0.017 
## 
## Node number 12: 754 observations
##   predicted class=B1  expected loss=0.4814324  P(node) =0.1508
##     class counts:   391   239    79    38     7
##    probabilities: 0.519 0.317 0.105 0.050 0.009 
## 
## Node number 13: 169 observations
##   predicted class=B2  expected loss=0.5739645  P(node) =0.0338
##     class counts:    54    72    30    12     1
##    probabilities: 0.320 0.426 0.178 0.071 0.006 
## 
## n= 5000 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 5000 1644 B1 (0.67 0.19 0.089 0.043 0.0058)  
##    2) reimbursement2008< 1545 3025  366 B1 (0.88 0.075 0.032 0.012 0.00099) *
##    3) reimbursement2008>=1545 1975 1252 B2 (0.35 0.37 0.18 0.091 0.013)  
##      6) reimbursement2008< 3765 923  478 B1 (0.48 0.34 0.12 0.054 0.0087)  
##       12) kidney< 0.5 754  363 B1 (0.52 0.32 0.1 0.05 0.0093) *
##       13) kidney>=0.5 169   97 B2 (0.32 0.43 0.18 0.071 0.0059) *
##      7) reimbursement2008>=3765 1052  640 B2 (0.24 0.39 0.23 0.12 0.017) *
##    B1 B2 B3 B4 B5
## B1  7  1  1  0  0
## B2  0  0  0  0  0
## B3  0  1  0  0  0
## B4  0  0  0  0  0
## B5  0  0  0  0  0
## + Fold1: cp=0 
##     B1  B2 B3 B4 B5
## B1 966 125 23  5  0
## B2 167 112 32  6  0
## B3  54  61 23 11  0
## B4  27  27 17  2  0
## B5   2   5  1  1  0
##      B1  B2 B3 B4 B5
## B1 1014 105  0  0  0
## B2  170 147  0  0  0
## B3   59  90  0  0  0
## B4   28  45  0  0  0
## B5    1   8  0  0  0
##      B1  B2 B3 B4 B5
## B1 1034  85  0  0  0
## B2  187 130  0  0  0
## B3   67  82  0  0  0
## B4   32  41  0  0  0
## B5    2   7  0  0  0
##      B1  B2 B3 B4 B5
## B1 1034  85  0  0  0
## B2  187 130  0  0  0
## B3   67  82  0  0  0
## B4   32  41  0  0  0
## B5    2   7  0  0  0
##      B1  B2 B3 B4 B5
## B1 1034  85  0  0  0
## B2  187 130  0  0  0
## B3   67  82  0  0  0
## B4   32  41  0  0  0
## B5    2   7  0  0  0
## - Fold1: cp=0 
## + Fold2: cp=0 
##     B1  B2 B3 B4 B5
## B1 975 115 27  2  0
## B2 157  98 54  8  0
## B3  63  56 28  2  0
## B4  29  21 18  4  0
## B5   4   3  2  1  0
##      B1  B2 B3 B4 B5
## B1 1010 109  0  0  0
## B2  141 176  0  0  0
## B3   62  87  0  0  0
## B4   28  44  0  0  0
## B5    4   6  0  0  0
##      B1  B2 B3 B4 B5
## B1 1034  85  0  0  0
## B2  179 138  0  0  0
## B3   74  75  0  0  0
## B4   32  40  0  0  0
## B5    5   5  0  0  0
##     B1  B2 B3 B4 B5
## B1 981 138  0  0  0
## B2 132 185  0  0  0
## B3  54  95  0  0  0
## B4  23  49  0  0  0
## B5   4   6  0  0  0
##      B1 B2 B3 B4 B5
## B1 1119  0  0  0  0
## B2  317  0  0  0  0
## B3  149  0  0  0  0
## B4   72  0  0  0  0
## B5   10  0  0  0  0
## - Fold2: cp=0 
## + Fold3: cp=0 
##     B1 B2 B3 B4 B5
## B1 998 86 23 11  0
## B2 170 99 36 12  0
## B3  78 48 15  8  0
## B4  29 24 13  6  0
## B5   2  3  0  5  0
##      B1  B2 B3 B4 B5
## B1 1036  82  0  0  0
## B2  193 124  0  0  0
## B3   74  75  0  0  0
## B4   25  47  0  0  0
## B5    4   6  0  0  0
##      B1  B2 B3 B4 B5
## B1 1036  82  0  0  0
## B2  193 124  0  0  0
## B3   74  75  0  0  0
## B4   25  47  0  0  0
## B5    4   6  0  0  0
##      B1  B2 B3 B4 B5
## B1 1036  82  0  0  0
## B2  193 124  0  0  0
## B3   74  75  0  0  0
## B4   25  47  0  0  0
## B5    4   6  0  0  0
##      B1  B2 B3 B4 B5
## B1 1036  82  0  0  0
## B2  193 124  0  0  0
## B3   74  75  0  0  0
## B4   25  47  0  0  0
## B5    4   6  0  0  0
## - Fold3: cp=0 
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.02 on full training set

## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7, 
##     cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2, 
##     surrogatestyle = 0, maxdepth = 30, xval = 0))
##   n= 5000 
## 
##          CP nsplit rel error
## 1 0.0486618      0 1.0000000
## 2 0.0200000      2 0.9026764
## 
## Variable importance
## reimbursement2008        bucket2008               ihd          diabetes 
##                31                20                14                13 
##     heart.failure            kidney 
##                12                10 
## 
## Node number 1: 5000 observations,    complexity param=0.0486618
##   predicted class=B1  expected loss=0.3288  P(node) =1
##     class counts:  3356   951   447   217    29
##    probabilities: 0.671 0.190 0.089 0.043 0.006 
##   left son=2 (3025 obs) right son=3 (1975 obs)
##   Primary splits:
##       reimbursement2008 < 1545 to the left,  improve=464.1939, (0 missing)
##       bucket2008        < 1.5  to the left,  improve=375.5983, (0 missing)
##       diabetes          < 0.5  to the left,  improve=323.2704, (0 missing)
##       ihd               < 0.5  to the left,  improve=319.3596, (0 missing)
##       heart.failure     < 0.5  to the left,  improve=227.0499, (0 missing)
##   Surrogate splits:
##       bucket2008    < 1.5  to the left,  agree=0.860, adj=0.646, (0 split)
##       ihd           < 0.5  to the left,  agree=0.790, adj=0.467, (0 split)
##       diabetes      < 0.5  to the left,  agree=0.784, adj=0.454, (0 split)
##       heart.failure < 0.5  to the left,  agree=0.764, adj=0.403, (0 split)
##       kidney        < 0.5  to the left,  agree=0.730, adj=0.316, (0 split)
## 
## Node number 2: 3025 observations
##   predicted class=B1  expected loss=0.1209917  P(node) =0.605
##     class counts:  2659   228    98    37     3
##    probabilities: 0.879 0.075 0.032 0.012 0.001 
## 
## Node number 3: 1975 observations,    complexity param=0.0486618
##   predicted class=B2  expected loss=0.6339241  P(node) =0.395
##     class counts:   697   723   349   180    26
##    probabilities: 0.353 0.366 0.177 0.091 0.013 
##   left son=6 (923 obs) right son=7 (1052 obs)
##   Primary splits:
##       reimbursement2008 < 3765 to the left,  improve=38.75805, (0 missing)
##       bucket2008        < 1.5  to the left,  improve=30.80855, (0 missing)
##       kidney            < 0.5  to the left,  improve=28.52915, (0 missing)
##       ihd               < 0.5  to the left,  improve=27.61120, (0 missing)
##       diabetes          < 0.5  to the left,  improve=23.99424, (0 missing)
##   Surrogate splits:
##       bucket2008    < 1.5  to the left,  agree=0.887, adj=0.757, (0 split)
##       kidney        < 0.5  to the left,  agree=0.660, adj=0.272, (0 split)
##       heart.failure < 0.5  to the left,  agree=0.639, adj=0.228, (0 split)
##       copd          < 0.5  to the left,  agree=0.622, adj=0.191, (0 split)
##       ihd           < 0.5  to the left,  agree=0.607, adj=0.159, (0 split)
## 
## Node number 6: 923 observations
##   predicted class=B1  expected loss=0.5178765  P(node) =0.1846
##     class counts:   445   311   109    50     8
##    probabilities: 0.482 0.337 0.118 0.054 0.009 
## 
## Node number 7: 1052 observations
##   predicted class=B2  expected loss=0.608365  P(node) =0.2104
##     class counts:   252   412   240   130    18
##    probabilities: 0.240 0.392 0.228 0.124 0.017 
## 
## n= 5000 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 5000 1644 B1 (0.67 0.19 0.089 0.043 0.0058)  
##   2) reimbursement2008< 1545 3025  366 B1 (0.88 0.075 0.032 0.012 0.00099) *
##   3) reimbursement2008>=1545 1975 1252 B2 (0.35 0.37 0.18 0.091 0.013)  
##     6) reimbursement2008< 3765 923  478 B1 (0.48 0.34 0.12 0.054 0.0087) *
##     7) reimbursement2008>=3765 1052  640 B2 (0.24 0.39 0.23 0.12 0.017) *
# Simplify a model
# fit_df <- glb_entity_df; glb_mdl <- step(<complex>_mdl)

# Re-index the collected model-summary rows 1..n.
# seq_len() is safe for a zero-row data frame, unlike the original
# seq(1:nrow(...)) idiom, which yields c(1, 0) when nrow is 0.
rownames(glb_models_df) <- seq_len(nrow(glb_models_df))
# Truncate the feature list so scatter-plot labels stay readable
plot_models_df <- mutate(glb_models_df, feats.label=substr(feats, 1, 30))
if (glb_is_regression) {
    # Rank models by OOB R-squared (desc), then by fit adjusted R-squared (desc)
    print(orderBy(~ -R.sq.OOB -Adj.R.sq.fit, glb_models_df))
    # NOTE(review): stop() makes the print() below unreachable — presumably a
    # deliberate guard until a regression model-selection rule is chosen.
    stop("glb_sel_mdl not selected")
    print(myplot_scatter(plot_models_df, "Adj.R.sq.fit", "R.sq.OOB") + 
          geom_text(aes(label=feats.label), data=plot_models_df, color="NavyBlue", 
                    size=3.5, angle=45))
}    

if (glb_is_classification) {
    # Model selection for classifiers: rank by auc.OOB (desc), then
    # accuracy.fit (desc), with cross-validation variability
    # (accuracySD.fit, asc) as the tie-breaker. Lower AIC would also be
    # better, but AIC is NA for non-glm methods.
    sel_frmla <- formula(paste("~ ", paste0("-auc.OOB", "-accuracy.fit", "+accuracySD.fit")))
    print(tmp_models_df <- orderBy(sel_frmla, glb_models_df))    
    print("Best model using metrics:")
    # Row names of glb_models_df were reset to 1..n, so the first row name
    # of the sorted frame indexes the best model in glb_models_lst
    myprint_mdl(bst_mdl <- glb_models_lst[[as.numeric(rownames(tmp_models_df)[1])]])

    if (!is.null(glb_sel_mdl)) { # Model is user selected
        print("User selected model: ")
        myprint_mdl(glb_sel_mdl)
    } else glb_sel_mdl <- bst_mdl
    
    # Invert AIC so that "bigger is better" on both plot axes
    plot_models_df[, "inv.AIC.fit"] <- 1.0 / plot_models_df[, "AIC.fit"] 
    # Both any() calls yield length-1 logicals, so use scalar || in if()
    if (any(!is.na(plot_models_df$inv.AIC.fit)) || any(!is.na(plot_models_df$auc.OOB))) {
        print(myplot_scatter(plot_models_df, "inv.AIC.fit", "auc.OOB") + 
              geom_text(aes(label=feats.label), data=plot_models_df, color="NavyBlue", 
                        size=3.5, angle=45))
    } else warning("All NAs for inv.AIC.fit vs. auc.OOB scatter plot of glb_models_df")
    
    if (any(!is.na(plot_models_df$auc.OOB))) {
        # OOB AUC vs. fit accuracy, with +/- 1 SD error bars from CV folds
        print(myplot_scatter(plot_models_df, "auc.OOB", "accuracy.fit",  
                         colorcol_name="method") + 
          geom_errorbar(aes(x=auc.OOB, ymin=accuracy.fit - accuracySD.fit,
                            ymax=accuracy.fit + accuracySD.fit), 
            width=(max(plot_models_df$auc.OOB)-min(plot_models_df$auc.OOB))/25) +      
          geom_text(aes(label=feats.label), data=plot_models_df, color="NavyBlue", 
                    size=3.5, angle=45))
    } else {
        warning("All NAs for auc.OOB in auc.OOB vs. accuracy.fit scatter plot of glb_models_df")
        # Fallback when AUC is unavailable (e.g. multinomial outcomes):
        # bar chart of fit accuracy per feature set, with +/- 1 SD bars
        print(ggplot(plot_models_df, aes(x=reorder(feats.label, accuracy.fit), y=accuracy.fit, 
                                         fill=factor(method))) +
                  geom_bar(stat="identity", position="dodge") + 
          geom_errorbar(aes(x=feats.label, ymin=accuracy.fit - accuracySD.fit,
                            ymax=accuracy.fit + accuracySD.fit, color=factor(method)), 
            width=0.2) +      
          theme(axis.text.x = element_text(angle = 45,vjust = 1)))        
    }    
    
    # mdl$times plot across models: inverse elapsed times so that
    # faster models appear toward the top-right
    print(myplot_scatter(plot_models_df, "inv.elapsedtime.everything", "inv.elapsedtime.final",  
                     colorcol_name="method") + 
          geom_point(aes(size=accuracy.fit)) +      
          geom_text(aes(label=feats.label), data=plot_models_df, color="NavyBlue", 
                    size=3.5, angle=45) + 
        geom_smooth(method="lm"))
}
##   method
## 5  rpart
## 4  rpart
## 2  rpart
## 3  rpart
## 1  rpart
## 6  rpart
## 7  rpart
##                                                                                                                                             feats
## 5 age, alzheimers, arthritis, cancer, copd, depression, diabetes, heart.failure, ihd, kidney, osteoporosis, stroke, reimbursement2008, bucket2008
## 4                    bucket2008, diabetes, ihd, kidney, heart.failure, copd, depression, alzheimers, arthritis, osteoporosis, cancer, stroke, age
## 2                                                                                                                                      bucket2008
## 3                                                                                                                   bucket2008, reimbursement2008
## 1                                                                                                                                          .rnorm
## 6 age, alzheimers, arthritis, cancer, copd, depression, diabetes, heart.failure, ihd, kidney, osteoporosis, stroke, reimbursement2008, bucket2008
## 7 age, alzheimers, arthritis, cancer, copd, depression, diabetes, heart.failure, ihd, kidney, osteoporosis, stroke, reimbursement2008, bucket2008
##   n.fit inv.elapsedtime.everything inv.elapsedtime.final R.sq.fit R.sq.OOB
## 5  5000                  0.5903188              4.255319       NA       NA
## 4  5000                  0.5485464              4.484305       NA       NA
## 2  5000                  0.9680542             13.698630       NA       NA
## 3  5000                  0.8680556             10.204082       NA       NA
## 1  5000                  0.6493506              9.523810       NA       NA
## 6  5000                  0.4555809              4.201681       NA       NA
## 7  5000                  0.5099439              4.273504       NA       NA
##   Adj.R.sq.fit SSE.fit SSE.OOB AIC.fit auc.fit auc.OOB accuracy.fit
## 5           NA       0      NA      NA      NA      NA    0.7044028
## 4           NA       0      NA      NA      NA      NA    0.7040027
## 2           NA       0      NA      NA      NA      NA    0.7025995
## 3           NA       0      NA      NA      NA      NA    0.6950006
## 1           NA       0      NA      NA      NA      NA    0.6714002
## 6           NA       0      NA      NA      NA      NA           NA
## 7           NA       0      NA      NA      NA      NA           NA
##   accuracySD.fit
## 5   0.0122012896
## 4   0.0125838480
## 2   0.0063929795
## 3   0.0082416065
## 1   0.0007592686
## 6             NA
## 7             NA
## [1] "Best model using metrics:"

## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7, 
##     cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2, 
##     surrogatestyle = 0, maxdepth = 30, xval = 0))
##   n= 5000 
## 
##           CP nsplit rel error
## 1 0.04866180      0 1.0000000
## 2 0.01094891      2 0.9026764
## 3 0.01000000      3 0.8917275
## 
## Variable importance
## reimbursement2008        bucket2008               ihd          diabetes 
##                31                20                14                13 
##     heart.failure            kidney 
##                12                10 
## 
## Node number 1: 5000 observations,    complexity param=0.0486618
##   predicted class=B1  expected loss=0.3288  P(node) =1
##     class counts:  3356   951   447   217    29
##    probabilities: 0.671 0.190 0.089 0.043 0.006 
##   left son=2 (3025 obs) right son=3 (1975 obs)
##   Primary splits:
##       reimbursement2008 < 1545 to the left,  improve=464.1939, (0 missing)
##       bucket2008        < 1.5  to the left,  improve=375.5983, (0 missing)
##       diabetes          < 0.5  to the left,  improve=323.2704, (0 missing)
##       ihd               < 0.5  to the left,  improve=319.3596, (0 missing)
##       heart.failure     < 0.5  to the left,  improve=227.0499, (0 missing)
##   Surrogate splits:
##       bucket2008    < 1.5  to the left,  agree=0.860, adj=0.646, (0 split)
##       ihd           < 0.5  to the left,  agree=0.790, adj=0.467, (0 split)
##       diabetes      < 0.5  to the left,  agree=0.784, adj=0.454, (0 split)
##       heart.failure < 0.5  to the left,  agree=0.764, adj=0.403, (0 split)
##       kidney        < 0.5  to the left,  agree=0.730, adj=0.316, (0 split)
## 
## Node number 2: 3025 observations
##   predicted class=B1  expected loss=0.1209917  P(node) =0.605
##     class counts:  2659   228    98    37     3
##    probabilities: 0.879 0.075 0.032 0.012 0.001 
## 
## Node number 3: 1975 observations,    complexity param=0.0486618
##   predicted class=B2  expected loss=0.6339241  P(node) =0.395
##     class counts:   697   723   349   180    26
##    probabilities: 0.353 0.366 0.177 0.091 0.013 
##   left son=6 (923 obs) right son=7 (1052 obs)
##   Primary splits:
##       reimbursement2008 < 3765 to the left,  improve=38.75805, (0 missing)
##       bucket2008        < 1.5  to the left,  improve=30.80855, (0 missing)
##       kidney            < 0.5  to the left,  improve=28.52915, (0 missing)
##       ihd               < 0.5  to the left,  improve=27.61120, (0 missing)
##       diabetes          < 0.5  to the left,  improve=23.99424, (0 missing)
##   Surrogate splits:
##       bucket2008    < 1.5  to the left,  agree=0.887, adj=0.757, (0 split)
##       kidney        < 0.5  to the left,  agree=0.660, adj=0.272, (0 split)
##       heart.failure < 0.5  to the left,  agree=0.639, adj=0.228, (0 split)
##       copd          < 0.5  to the left,  agree=0.622, adj=0.191, (0 split)
##       ihd           < 0.5  to the left,  agree=0.607, adj=0.159, (0 split)
## 
## Node number 6: 923 observations,    complexity param=0.01094891
##   predicted class=B1  expected loss=0.5178765  P(node) =0.1846
##     class counts:   445   311   109    50     8
##    probabilities: 0.482 0.337 0.118 0.054 0.009 
##   left son=12 (754 obs) right son=13 (169 obs)
##   Primary splits:
##       kidney            < 0.5  to the left,  improve=7.902130, (0 missing)
##       diabetes          < 0.5  to the left,  improve=6.451528, (0 missing)
##       ihd               < 0.5  to the left,  improve=6.445818, (0 missing)
##       reimbursement2008 < 2545 to the left,  improve=5.585770, (0 missing)
##       arthritis         < 0.5  to the left,  improve=5.444392, (0 missing)
##   Surrogate splits:
##       reimbursement2008 < 3755 to the left,  agree=0.818, adj=0.006, (0 split)
## 
## Node number 7: 1052 observations
##   predicted class=B2  expected loss=0.608365  P(node) =0.2104
##     class counts:   252   412   240   130    18
##    probabilities: 0.240 0.392 0.228 0.124 0.017 
## 
## Node number 12: 754 observations
##   predicted class=B1  expected loss=0.4814324  P(node) =0.1508
##     class counts:   391   239    79    38     7
##    probabilities: 0.519 0.317 0.105 0.050 0.009 
## 
## Node number 13: 169 observations
##   predicted class=B2  expected loss=0.5739645  P(node) =0.0338
##     class counts:    54    72    30    12     1
##    probabilities: 0.320 0.426 0.178 0.071 0.006 
## 
## n= 5000 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 5000 1644 B1 (0.67 0.19 0.089 0.043 0.0058)  
##    2) reimbursement2008< 1545 3025  366 B1 (0.88 0.075 0.032 0.012 0.00099) *
##    3) reimbursement2008>=1545 1975 1252 B2 (0.35 0.37 0.18 0.091 0.013)  
##      6) reimbursement2008< 3765 923  478 B1 (0.48 0.34 0.12 0.054 0.0087)  
##       12) kidney< 0.5 754  363 B1 (0.52 0.32 0.1 0.05 0.0093) *
##       13) kidney>=0.5 169   97 B2 (0.32 0.43 0.18 0.071 0.0059) *
##      7) reimbursement2008>=3765 1052  640 B2 (0.24 0.39 0.23 0.12 0.017) *
## [1] "User selected model: "
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7, 
##     cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2, 
##     surrogatestyle = 0, maxdepth = 30, xval = 0))
##   n= 5000 
## 
##          CP nsplit rel error
## 1 0.0486618      0 1.0000000
## 2 0.0200000      2 0.9026764
## 
## Variable importance
## reimbursement2008        bucket2008               ihd          diabetes 
##                31                20                14                13 
##     heart.failure            kidney 
##                12                10 
## 
## Node number 1: 5000 observations,    complexity param=0.0486618
##   predicted class=B1  expected loss=0.3288  P(node) =1
##     class counts:  3356   951   447   217    29
##    probabilities: 0.671 0.190 0.089 0.043 0.006 
##   left son=2 (3025 obs) right son=3 (1975 obs)
##   Primary splits:
##       reimbursement2008 < 1545 to the left,  improve=464.1939, (0 missing)
##       bucket2008        < 1.5  to the left,  improve=375.5983, (0 missing)
##       diabetes          < 0.5  to the left,  improve=323.2704, (0 missing)
##       ihd               < 0.5  to the left,  improve=319.3596, (0 missing)
##       heart.failure     < 0.5  to the left,  improve=227.0499, (0 missing)
##   Surrogate splits:
##       bucket2008    < 1.5  to the left,  agree=0.860, adj=0.646, (0 split)
##       ihd           < 0.5  to the left,  agree=0.790, adj=0.467, (0 split)
##       diabetes      < 0.5  to the left,  agree=0.784, adj=0.454, (0 split)
##       heart.failure < 0.5  to the left,  agree=0.764, adj=0.403, (0 split)
##       kidney        < 0.5  to the left,  agree=0.730, adj=0.316, (0 split)
## 
## Node number 2: 3025 observations
##   predicted class=B1  expected loss=0.1209917  P(node) =0.605
##     class counts:  2659   228    98    37     3
##    probabilities: 0.879 0.075 0.032 0.012 0.001 
## 
## Node number 3: 1975 observations,    complexity param=0.0486618
##   predicted class=B2  expected loss=0.6339241  P(node) =0.395
##     class counts:   697   723   349   180    26
##    probabilities: 0.353 0.366 0.177 0.091 0.013 
##   left son=6 (923 obs) right son=7 (1052 obs)
##   Primary splits:
##       reimbursement2008 < 3765 to the left,  improve=38.75805, (0 missing)
##       bucket2008        < 1.5  to the left,  improve=30.80855, (0 missing)
##       kidney            < 0.5  to the left,  improve=28.52915, (0 missing)
##       ihd               < 0.5  to the left,  improve=27.61120, (0 missing)
##       diabetes          < 0.5  to the left,  improve=23.99424, (0 missing)
##   Surrogate splits:
##       bucket2008    < 1.5  to the left,  agree=0.887, adj=0.757, (0 split)
##       kidney        < 0.5  to the left,  agree=0.660, adj=0.272, (0 split)
##       heart.failure < 0.5  to the left,  agree=0.639, adj=0.228, (0 split)
##       copd          < 0.5  to the left,  agree=0.622, adj=0.191, (0 split)
##       ihd           < 0.5  to the left,  agree=0.607, adj=0.159, (0 split)
## 
## Node number 6: 923 observations
##   predicted class=B1  expected loss=0.5178765  P(node) =0.1846
##     class counts:   445   311   109    50     8
##    probabilities: 0.482 0.337 0.118 0.054 0.009 
## 
## Node number 7: 1052 observations
##   predicted class=B2  expected loss=0.608365  P(node) =0.2104
##     class counts:   252   412   240   130    18
##    probabilities: 0.240 0.392 0.228 0.124 0.017 
## 
## n= 5000 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 5000 1644 B1 (0.67 0.19 0.089 0.043 0.0058)  
##   2) reimbursement2008< 1545 3025  366 B1 (0.88 0.075 0.032 0.012 0.00099) *
##   3) reimbursement2008>=1545 1975 1252 B2 (0.35 0.37 0.18 0.091 0.013)  
##     6) reimbursement2008< 3765 923  478 B1 (0.48 0.34 0.12 0.054 0.0087) *
##     7) reimbursement2008>=3765 1052  640 B2 (0.24 0.39 0.23 0.12 0.017) *
## Warning: All NAs for inv.AIC.fit vs. auc.OOB scatter plot of glb_models_df
## Warning: All NAs for auc.OOB in auc.OOB vs. accuracy.fit scatter plot of
## glb_models_df

## Warning: Removed 2 rows containing missing values (geom_point).

# Mark "model.selected" as available and replay the analytics Petri net.
# The updated object list is assigned first, then passed to the replay.
glb_analytics_avl_objs <- c(glb_analytics_avl_objs, "model.selected")
replay.petrisim(pn=glb_analytics_pn, replay.trans=glb_analytics_avl_objs,
                flip_coord=TRUE)
## time trans    "bgn " "fit.data.training.all " "predict.data.new " "end " 
## 0.0000   multiple enabled transitions:  data.training.all data.new model.selected    firing:  data.training.all 
## 1.0000    1   2 1 0 0 
## 1.0000   multiple enabled transitions:  data.training.all data.new model.selected model.final data.training.all.prediction   firing:  data.new 
## 2.0000    2   1 1 1 0 
## 2.0000   multiple enabled transitions:  data.training.all data.new model.selected model.final data.training.all.prediction data.new.prediction   firing:  model.selected 
## 3.0000    3   0 2 1 0

# Record elapsed time for this chunk as a new major step in the script log
chunk_entry <- data.frame(
    chunk_label="fit.data.training.all",
    chunk_step_major=max(glb_script_df$chunk_step_major) + 1,
    chunk_step_minor=0,
    elapsed=(proc.time() - glb_script_tm)["elapsed"])
glb_script_df <- rbind(glb_script_df, chunk_entry)
print(tail(glb_script_df, 2))
##                    chunk_label chunk_step_major chunk_step_minor elapsed
## elapsed8            run.models                5                0  13.903
## elapsed9 fit.data.training.all                6                0  35.325

Step 6: fit.data.training.all

# Extract the features used by the selected model, ranked by importance
mdl_feats_df <- myextract_mdl_feats(sel_mdl=glb_sel_mdl,
                                    entity_df=glb_entity_df)
print(mdl_feats_df)
##                   importance                id fit.feat
## reimbursement2008 100.000000 reimbursement2008     TRUE
## bucket2008         80.804319        bucket2008     TRUE
## diabetes           69.045297          diabetes     TRUE
## ihd                68.986875               ihd     TRUE
## heart.failure      45.143466     heart.failure     TRUE
## kidney              5.672342            kidney     TRUE
## age                 0.000000               age     TRUE
## alzheimers          0.000000        alzheimers     TRUE
## arthritis           0.000000         arthritis     TRUE
## cancer              0.000000            cancer     TRUE
## copd                0.000000              copd     TRUE
## depression          0.000000        depression     TRUE
## osteoporosis        0.000000      osteoporosis     TRUE
## stroke              0.000000            stroke     TRUE
#     ret_lst <- myrun_mdl_fn(indep_vars_vctr=indep_vars_vctr,
#                             rsp_var=glb_rsp_var, 
#                             rsp_var_out=glb_rsp_var_out,
#                             fit_df=glb_entity_df, 
#                             OOB_df=glb_newent_df,
#                             method=method, 
#                             tune_models_df=glb_tune_models_df,
#                             n_cv_folds=glb_n_cv_folds,
#                             loss_mtrx=glb_loss_mtrx,
#                             summaryFunction=glb_loss_smmry,
#                             metric="loss.error",
#                             maximize=FALSE)
# Refit the selected model on ALL training data (fit_df=glb_entity_df, no
# OOB_df) using the selected model's features, method, metric and summary
# function, so the final model sees every training observation.
# NOTE(review): n_cv_folds is hard-coded to 2 (glb_n_cv_folds is bypassed,
# see inline comment) — presumably to cut refit time; confirm intent.
ret_lst <- myrun_mdl_fn(indep_vars_vctr=mdl_feats_df$id,
                        rsp_var=glb_rsp_var, 
                        rsp_var_out=glb_rsp_var_out, 
                        fit_df=glb_entity_df,
                        method=glb_sel_mdl$method,
                        tune_models_df=glb_tune_models_df,
                        n_cv_folds=2, # glb_n_cv_folds,
                        loss_mtrx=glb_loss_mtrx,
                        summaryFunction=glb_sel_mdl$control$summaryFunction,
                        metric=glb_sel_mdl$metric,
                        maximize=glb_sel_mdl$maximize)
##    B1 B2 B3 B4 B5
## B1  6  2  0  0  0
## B2  2  0  0  0  0
## B3  0  0  0  0  0
## B4  0  0  0  0  0
## B5  0  0  0  0  0
## + Fold1: cp=0 
##      B1  B2 B3 B4 B5
## B1 1458 170 38 12  0
## B2  221 170 57 27  0
## B3   96  79 34 15  0
## B4   43  37 15 14  0
## B5    5   3  4  2  0
##      B1  B2 B3 B4 B5
## B1 1504 174  0  0  0
## B2  214 261  0  0  0
## B3   88 136  0  0  0
## B4   44  65  0  0  0
## B5    4  10  0  0  0
##      B1  B2 B3 B4 B5
## B1 1423 255  0  0  0
## B2  172 303  0  0  0
## B3   71 153  0  0  0
## B4   37  72  0  0  0
## B5    4  10  0  0  0
##      B1  B2 B3 B4 B5
## B1 1423 255  0  0  0
## B2  172 303  0  0  0
## B3   71 153  0  0  0
## B4   37  72  0  0  0
## B5    4  10  0  0  0
##      B1  B2 B3 B4 B5
## B1 1423 255  0  0  0
## B2  172 303  0  0  0
## B3   71 153  0  0  0
## B4   37  72  0  0  0
## B5    4  10  0  0  0
## - Fold1: cp=0 
## + Fold2: cp=0 
##      B1  B2 B3 B4 B5
## B1 1483 146 35 14  0
## B2  257 154 51 14  0
## B3  104  92 20  7  0
## B4   39  45 16  8  0
## B5    6   7  2  0  0
##      B1  B2 B3 B4 B5
## B1 1573 105  0  0  0
## B2  306 170  0  0  0
## B3  108 115  0  0  0
## B4   43  65  0  0  0
## B5    6   9  0  0  0
##      B1  B2 B3 B4 B5
## B1 1595  83  0  0  0
## B2  350 126  0  0  0
## B3  132  91  0  0  0
## B4   49  59  0  0  0
## B5    8   7  0  0  0
##      B1  B2 B3 B4 B5
## B1 1595  83  0  0  0
## B2  350 126  0  0  0
## B3  132  91  0  0  0
## B4   49  59  0  0  0
## B5    8   7  0  0  0
##      B1  B2 B3 B4 B5
## B1 1595  83  0  0  0
## B2  350 126  0  0  0
## B3  132  91  0  0  0
## B4   49  59  0  0  0
## B5    8   7  0  0  0
## - Fold2: cp=0 
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.01 on full training set

## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7, 
##     cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2, 
##     surrogatestyle = 0, maxdepth = 30, xval = 0))
##   n= 5000 
## 
##           CP nsplit rel error
## 1 0.04866180      0 1.0000000
## 2 0.01094891      2 0.9026764
## 3 0.01000000      3 0.8917275
## 
## Variable importance
## reimbursement2008        bucket2008               ihd          diabetes 
##                31                20                14                13 
##     heart.failure            kidney 
##                12                10 
## 
## Node number 1: 5000 observations,    complexity param=0.0486618
##   predicted class=B1  expected loss=0.3288  P(node) =1
##     class counts:  3356   951   447   217    29
##    probabilities: 0.671 0.190 0.089 0.043 0.006 
##   left son=2 (3025 obs) right son=3 (1975 obs)
##   Primary splits:
##       reimbursement2008 < 1545 to the left,  improve=464.1939, (0 missing)
##       bucket2008        < 1.5  to the left,  improve=375.5983, (0 missing)
##       diabetes          < 0.5  to the left,  improve=323.2704, (0 missing)
##       ihd               < 0.5  to the left,  improve=319.3596, (0 missing)
##       heart.failure     < 0.5  to the left,  improve=227.0499, (0 missing)
##   Surrogate splits:
##       bucket2008    < 1.5  to the left,  agree=0.860, adj=0.646, (0 split)
##       ihd           < 0.5  to the left,  agree=0.790, adj=0.467, (0 split)
##       diabetes      < 0.5  to the left,  agree=0.784, adj=0.454, (0 split)
##       heart.failure < 0.5  to the left,  agree=0.764, adj=0.403, (0 split)
##       kidney        < 0.5  to the left,  agree=0.730, adj=0.316, (0 split)
## 
## Node number 2: 3025 observations
##   predicted class=B1  expected loss=0.1209917  P(node) =0.605
##     class counts:  2659   228    98    37     3
##    probabilities: 0.879 0.075 0.032 0.012 0.001 
## 
## Node number 3: 1975 observations,    complexity param=0.0486618
##   predicted class=B2  expected loss=0.6339241  P(node) =0.395
##     class counts:   697   723   349   180    26
##    probabilities: 0.353 0.366 0.177 0.091 0.013 
##   left son=6 (923 obs) right son=7 (1052 obs)
##   Primary splits:
##       reimbursement2008 < 3765 to the left,  improve=38.75805, (0 missing)
##       bucket2008        < 1.5  to the left,  improve=30.80855, (0 missing)
##       kidney            < 0.5  to the left,  improve=28.52915, (0 missing)
##       ihd               < 0.5  to the left,  improve=27.61120, (0 missing)
##       diabetes          < 0.5  to the left,  improve=23.99424, (0 missing)
##   Surrogate splits:
##       bucket2008    < 1.5  to the left,  agree=0.887, adj=0.757, (0 split)
##       kidney        < 0.5  to the left,  agree=0.660, adj=0.272, (0 split)
##       heart.failure < 0.5  to the left,  agree=0.639, adj=0.228, (0 split)
##       copd          < 0.5  to the left,  agree=0.622, adj=0.191, (0 split)
##       ihd           < 0.5  to the left,  agree=0.607, adj=0.159, (0 split)
## 
## Node number 6: 923 observations,    complexity param=0.01094891
##   predicted class=B1  expected loss=0.5178765  P(node) =0.1846
##     class counts:   445   311   109    50     8
##    probabilities: 0.482 0.337 0.118 0.054 0.009 
##   left son=12 (754 obs) right son=13 (169 obs)
##   Primary splits:
##       kidney            < 0.5  to the left,  improve=7.902130, (0 missing)
##       diabetes          < 0.5  to the left,  improve=6.451528, (0 missing)
##       ihd               < 0.5  to the left,  improve=6.445818, (0 missing)
##       reimbursement2008 < 2545 to the left,  improve=5.585770, (0 missing)
##       arthritis         < 0.5  to the left,  improve=5.444392, (0 missing)
##   Surrogate splits:
##       reimbursement2008 < 3755 to the left,  agree=0.818, adj=0.006, (0 split)
## 
## Node number 7: 1052 observations
##   predicted class=B2  expected loss=0.608365  P(node) =0.2104
##     class counts:   252   412   240   130    18
##    probabilities: 0.240 0.392 0.228 0.124 0.017 
## 
## Node number 12: 754 observations
##   predicted class=B1  expected loss=0.4814324  P(node) =0.1508
##     class counts:   391   239    79    38     7
##    probabilities: 0.519 0.317 0.105 0.050 0.009 
## 
## Node number 13: 169 observations
##   predicted class=B2  expected loss=0.5739645  P(node) =0.0338
##     class counts:    54    72    30    12     1
##    probabilities: 0.320 0.426 0.178 0.071 0.006 
## 
## n= 5000 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 5000 1644 B1 (0.67 0.19 0.089 0.043 0.0058)  
##    2) reimbursement2008< 1545 3025  366 B1 (0.88 0.075 0.032 0.012 0.00099) *
##    3) reimbursement2008>=1545 1975 1252 B2 (0.35 0.37 0.18 0.091 0.013)  
##      6) reimbursement2008< 3765 923  478 B1 (0.48 0.34 0.12 0.054 0.0087)  
##       12) kidney< 0.5 754  363 B1 (0.52 0.32 0.1 0.05 0.0093) *
##       13) kidney>=0.5 169   97 B2 (0.32 0.43 0.18 0.071 0.0059) *
##      7) reimbursement2008>=3765 1052  640 B2 (0.24 0.39 0.23 0.12 0.017) *
glb_fin_mdl <- ret_lst[["model"]]; 

# Record elapsed time as a minor step within the current major step
prev_ix <- nrow(glb_script_df)
glb_script_df <- rbind(
    glb_script_df,
    data.frame(chunk_label="fit.data.training.all",
               chunk_step_major=glb_script_df[prev_ix, "chunk_step_major"],
               chunk_step_minor=glb_script_df[prev_ix, "chunk_step_minor"] + 1,
               elapsed=(proc.time() - glb_script_tm)["elapsed"]))
print(tail(glb_script_df, 2))
##                     chunk_label chunk_step_major chunk_step_minor elapsed
## elapsed9  fit.data.training.all                6                0  35.325
## elapsed10 fit.data.training.all                6                1  40.509
if (glb_is_regression) {
    # Score the full training data with the final model
    glb_entity_df[, glb_rsp_var_out] <- predict(glb_fin_mdl, newdata=glb_entity_df)
    print(myplot_scatter(glb_entity_df, glb_rsp_var, glb_rsp_var_out, 
                         smooth=TRUE))
    # Absolute prediction error per observation
    err_col <- paste0(glb_rsp_var_out, ".err")
    glb_entity_df[, err_col] <- 
        abs(glb_entity_df[, glb_rsp_var_out] - glb_entity_df[, glb_rsp_var])
    # Show the worst-predicted rows first. Build "~ -<err_col>" explicitly:
    # the previous reformulate(c("-", <col>)) produced "~ - + <col>", which
    # only parsed correctly by accident of unary-operator precedence.
    print(head(orderBy(as.formula(paste0("~ -", err_col)), glb_entity_df)))
}    

# Scalar conditions: use && (not vectorized &) inside if()
if (glb_is_classification && glb_is_binomial) {
    proba_col <- paste0(glb_rsp_var_out, ".proba")
    # caret::train, rpart and randomForest all expose predict(type="prob"),
    # returning one column per class; column 2 is the positive class.
    # (The original duplicated identical branches for "train" and
    # "rpart"/"randomForest"; they are merged here.)
    if (inherits(glb_fin_mdl, c("train", "rpart", "randomForest"))) {
        glb_entity_df[, proba_col] <- 
            predict(glb_fin_mdl, newdata=glb_entity_df, type="prob")[, 2]
    } else if (inherits(glb_fin_mdl, "glm")) {
        # NOTE(review): the stop() makes the glm path unreachable;
        # remove it once the glm flow is validated
        stop("not implemented yet")
        glb_entity_df[, proba_col] <- 
            predict(glb_fin_mdl, newdata=glb_entity_df, type="response")
    } else stop("not implemented yet")   

    # ROC curve on the training data (ROCR handles binary outcomes only)
    require(ROCR)
    ROCRpred <- prediction(glb_entity_df[, proba_col],
                           glb_entity_df[, glb_rsp_var])
    ROCRperf <- performance(ROCRpred, "tpr", "fpr")
    plot(ROCRperf, colorize=TRUE, print.cutoffs.at=seq(0, 1, 0.1), text.adj=c(-0.2,1.7))
    
    # Scan candidate probability thresholds and compute f.score at each
    thresholds_df <- data.frame(threshold=seq(0.0, 1.0, 0.1))
    thresholds_df$f.score <- sapply(seq_len(nrow(thresholds_df)), function(row_ix) 
        mycompute_classifier_f.score(mdl=glb_fin_mdl, obs_df=glb_entity_df, 
                                     proba_threshold=thresholds_df[row_ix, "threshold"], 
                                      rsp_var=glb_rsp_var, 
                                      rsp_var_out=glb_rsp_var_out))
    print(thresholds_df)
    print(myplot_line(thresholds_df, "threshold", "f.score"))
    
    proba_threshold <- thresholds_df[which.max(thresholds_df$f.score), 
                                             "threshold"]
    # This should change to maximize f.score.OOB ???
    print(sprintf("Classifier Probability Threshold: %0.4f to maximize f.score.fit",
                  proba_threshold))
    # A user-specified threshold (glb_clf_proba_threshold) wins over the
    # fitted one; otherwise adopt the f.score-maximizing threshold
    if (is.null(glb_clf_proba_threshold)) {
        glb_clf_proba_threshold <- proba_threshold
    } else {
        print(sprintf("Classifier Probability Threshold: %0.4f per user specs",
                      glb_clf_proba_threshold))
    }

    # Binomial flow requires a 2-level factor response; is.factor() also
    # covers ordered factors (class(x) == "factor" would not)
    if (!is.factor(glb_entity_df[, glb_rsp_var]) || 
        (length(levels(glb_entity_df[, glb_rsp_var])) != 2))
        stop("expecting a factor with two levels:", glb_rsp_var)
    # proba >= threshold -> level 2 (positive), else level 1
    glb_entity_df[, glb_rsp_var_out] <- 
        factor(levels(glb_entity_df[, glb_rsp_var])[
            (glb_entity_df[, proba_col] >= 
                glb_clf_proba_threshold) * 1 + 1])
             
    # Confusion table and f.score at the chosen threshold
    print(mycreate_xtab(glb_entity_df, c(glb_rsp_var, glb_rsp_var_out)))
    print(sprintf("f.score=%0.4f", 
        mycompute_classifier_f.score(glb_fin_mdl, glb_entity_df, 
                                     glb_clf_proba_threshold, 
                                     glb_rsp_var, glb_rsp_var_out)))    
}

# Scalar conditions: use && (not vectorized &) inside if()
if (glb_is_classification && !glb_is_binomial) {
    # Multinomial case: predict class labels directly; no probability
    # threshold applies (ROCR supports binary outcomes only, v1.0.7)
    glb_entity_df[, glb_rsp_var_out] <- predict(glb_fin_mdl, newdata=glb_entity_df, type="raw")
}    

print(glb_feats_df <- mymerge_feats_importance(glb_feats_df, glb_fin_mdl, glb_entity_df))
##                   id      cor.y  cor.y.abs cor.low importance
## 13 reimbursement2008 0.40427780 0.40427780      NA 100.000000
## 4         bucket2008 0.46125446 0.46125446       1  79.916766
## 8           diabetes 0.40896269 0.40896269       1  69.555548
## 10               ihd 0.40798592 0.40798592       1  69.496645
## 9      heart.failure 0.37320420 0.37320420       1  44.647610
## 11            kidney 0.38971036 0.38971036       1   7.163930
## 3          arthritis 0.25224332 0.25224332       1   1.070597
## 1                age 0.02529723 0.02529723       1   0.000000
## 2         alzheimers 0.27882540 0.27882540       1   0.000000
## 5             cancer 0.19499896 0.19499896       1   0.000000
## 6               copd 0.31969314 0.31969314       1   0.000000
## 7         depression 0.28668189 0.28668189       1   0.000000
## 12      osteoporosis 0.21770017 0.21770017       1   0.000000
## 14            stroke 0.16603039 0.16603039       1   0.000000
# Most of this code is used again in predict.data.new chunk
glb_analytics_diag_plots <- function(obs_df) {
    for (var in subset(glb_feats_df, !is.na(importance))$id) {
        plot_df <- melt(obs_df, id.vars=var, 
                        measure.vars=c(glb_rsp_var, glb_rsp_var_out))
#         if (var == "<feat_name>") print(myplot_scatter(plot_df, var, "value", 
#                                              facet_colcol_name="variable") + 
#                       geom_vline(xintercept=<divider_val>, linetype="dotted")) else     
            print(myplot_scatter(plot_df, var, "value", colorcol_name="variable",
                                 facet_colcol_name="variable", jitter=TRUE) + 
                      guides(color=FALSE))
    }
    
    if (glb_is_regression) {
        plot_vars_df <- subset(glb_feats_df, Pr.z < 0.1)
        print(myplot_prediction_regression(obs_df, 
                    ifelse(nrow(plot_vars_df) > 1, plot_vars_df$id[2], ".rownames"), 
                                           plot_vars_df$id[1],
                    glb_rsp_var, glb_rsp_var_out)
#               + facet_wrap(reformulate(plot_vars_df$id[2])) # if [1,2] is a factor                                                         
#               + geom_point(aes_string(color="<col_name>.fctr")) #  to color the plot
              )
    }    
    
    if (glb_is_classification) {
        if (nrow(plot_vars_df <- subset(glb_feats_df, !is.na(importance))) == 0)
            warning("No features in selected model are statistically important")
        else print(myplot_prediction_classification(df=obs_df, 
                feat_x=ifelse(nrow(plot_vars_df) > 1, plot_vars_df$id[2], 
                              ".rownames"),
                                               feat_y=plot_vars_df$id[1],
                     rsp_var=glb_rsp_var, 
                     rsp_var_out=glb_rsp_var_out, 
                     id_vars=glb_id_vars)
#               + geom_hline(yintercept=<divider_val>, linetype = "dotted")
                )
    }    
}
glb_analytics_diag_plots(obs_df=glb_entity_df)

##        age alzheimers arthritis cancer copd depression diabetes
## 199     73          0         0      0    0          0        0
## 153558  68          0         0      0    0          0        0
## 154439  70          0         0      0    1          0        1
## 162273  59          0         0      0    0          0        1
## 179045  74          1         0      0    0          0        1
## 190763  57          1         0      0    0          0        0
## 190868  67          1         0      0    0          0        0
## 192357  72          0         0      0    0          0        1
## 214932  80          1         0      1    0          1        1
## 262260  79          1         1      1    0          0        1
## 300375  88          1         0      0    0          0        1
## 307445  74          0         0      0    0          0        0
## 312790  75          0         0      0    0          0        0
## 313477  82          0         0      0    0          0        0
## 321503  86          0         0      0    0          0        0
## 338073  78          0         0      0    0          0        0
## 349461  74          0         0      0    0          0        0
## 398550  56          1         0      0    0          0        1
## 402417  53          1         0      0    0          1        1
## 456020  68          1         1      0    1          1        1
##        heart.failure ihd kidney osteoporosis stroke reimbursement2008
## 199                0   0      0            0      0                 0
## 153558             0   0      1            0      0              1790
## 154439             0   1      1            0      0             58230
## 162273             1   1      1            0      0              1820
## 179045             1   1      1            0      1              2360
## 190763             1   0      1            1      0              1580
## 190868             1   0      1            0      0              2590
## 192357             1   1      1            0      0              1640
## 214932             1   1      1            0      0             71660
## 262260             0   1      1            0      1            112980
## 300375             1   1      1            0      1             68350
## 307445             0   0      0            0      0                 0
## 312790             0   0      0            0      0                 0
## 313477             0   0      0            0      0                 0
## 321503             0   0      0            0      0                 0
## 338073             0   0      0            0      0                 0
## 349461             0   0      0            0      0                 0
## 398550             1   1      1            1      1             64830
## 402417             1   1      0            0      0             65590
## 456020             1   1      1            1      0            133330
##        bucket2008 reimbursement2009 bucket2009      .rnorm bucket2009.fctr
## 199             1                 0          1  0.09472856              B1
## 153558          1               560          1  0.74713704              B1
## 154439          5               570          1  0.11459397              B1
## 162273          1               670          1  2.30203690              B1
## 179045          1               880          1 -0.45292024              B1
## 190763          1              1030          1 -0.98121958              B1
## 190868          1              1030          1  0.60288110              B1
## 192357          1              1050          1  0.70214341              B1
## 214932          5              1340          1 -1.76578293              B1
## 262260          5              2040          1  0.62732336              B1
## 300375          5              2810          1 -0.19715402              B1
## 307445          1              3000          2  0.55120317              B2
## 312790          1              3150          2 -0.37917847              B2
## 313477          1              3170          2 -0.15857725              B2
## 321503          1              3410          2  0.43252159              B2
## 338073          1              4000          2 -0.81670317              B2
## 349461          1              4490          2  0.80808499              B2
## 398550          5              8570          3 -1.16511891              B3
## 402417          5              9200          3 -2.13025718              B3
## 456020          5             59270          5 -2.06323014              B5
##        bucket2009.fctr.prediction bucket2009.fctr.prediction.accurate
## 199                            B1                                TRUE
## 153558                         B2                               FALSE
## 154439                         B2                               FALSE
## 162273                         B2                               FALSE
## 179045                         B2                               FALSE
## 190763                         B2                               FALSE
## 190868                         B2                               FALSE
## 192357                         B2                               FALSE
## 214932                         B2                               FALSE
## 262260                         B2                               FALSE
## 300375                         B2                               FALSE
## 307445                         B1                               FALSE
## 312790                         B1                               FALSE
## 313477                         B1                               FALSE
## 321503                         B1                               FALSE
## 338073                         B1                               FALSE
## 349461                         B1                               FALSE
## 398550                         B2                               FALSE
## 402417                         B2                               FALSE
## 456020                         B2                               FALSE
##         .label
## 199       .199
## 153558 .153558
## 154439 .154439
## 162273 .162273
## 179045 .179045
## 190763 .190763
## 190868 .190868
## 192357 .192357
## 214932 .214932
## 262260 .262260
## 300375 .300375
## 307445 .307445
## 312790 .312790
## 313477 .313477
## 321503 .321503
## 338073 .338073
## 349461 .349461
## 398550 .398550
## 402417 .402417
## 456020 .456020

# Record the newly available analytics objects, then replay the Petri net
# with the updated transition list (flipped coordinates for readability)
glb_analytics_avl_objs <- c(glb_analytics_avl_objs, 
                            "data.training.all.prediction", "model.final")
replay.petrisim(pn=glb_analytics_pn, replay.trans=glb_analytics_avl_objs, 
                flip_coord=TRUE)
## time trans    "bgn " "fit.data.training.all " "predict.data.new " "end " 
## 0.0000   multiple enabled transitions:  data.training.all data.new model.selected    firing:  data.training.all 
## 1.0000    1   2 1 0 0 
## 1.0000   multiple enabled transitions:  data.training.all data.new model.selected model.final data.training.all.prediction   firing:  data.new 
## 2.0000    2   1 1 1 0 
## 2.0000   multiple enabled transitions:  data.training.all data.new model.selected model.final data.training.all.prediction data.new.prediction   firing:  model.selected 
## 3.0000    3   0 2 1 0 
## 3.0000   multiple enabled transitions:  model.final data.training.all.prediction data.new.prediction     firing:  data.training.all.prediction 
## 4.0000    5   0 1 1 1 
## 4.0000   multiple enabled transitions:  model.final data.training.all.prediction data.new.prediction     firing:  model.final 
## 5.0000    4   0 0 2 1

# Log the elapsed time for this chunk into the script-step tracker
new_step_df <- data.frame(
    chunk_label="predict.data.new", 
    chunk_step_major=max(glb_script_df$chunk_step_major) + 1, 
    chunk_step_minor=0,
    elapsed=(proc.time() - glb_script_tm)["elapsed"])
glb_script_df <- rbind(glb_script_df, new_step_df)
print(tail(glb_script_df, 2))
##                     chunk_label chunk_step_major chunk_step_minor elapsed
## elapsed10 fit.data.training.all                6                1  40.509
## elapsed11      predict.data.new                7                0  58.094

Step 7: predict data.new

# For regression, score the new data with the final model directly
if (glb_is_regression) {
    glb_newent_df[, glb_rsp_var_out] <- 
        predict(glb_fin_mdl, newdata=glb_newent_df, type="response")
}

if (glb_is_classification & glb_is_binomial) {
    # Map a probability vector onto the two response-factor levels using the
    # tuned classification threshold; validates the response is a 2-level
    # factor (previously this validation was duplicated twice below)
    proba_to_outcome <- function(proba) {
        if (!is.factor(glb_newent_df[, glb_rsp_var]) || 
            (length(levels(glb_newent_df[, glb_rsp_var])) != 2))
            stop("expecting a factor with two levels:", glb_rsp_var)
        factor(levels(glb_newent_df[, glb_rsp_var])[
            (proba >= glb_clf_proba_threshold) * 1 + 1])
    }

    # Compute selected model predictions
    # caret::train, rpart & randomForest all expose type="prob"; the original
    # handled them in two byte-identical branches, merged here
    if (any(class(glb_fin_mdl) %in% c("train", "rpart", "randomForest"))) {
        glb_newent_df[, paste0(glb_rsp_var_out, ".proba")] <- 
            predict(glb_fin_mdl, newdata=glb_newent_df, type="prob")[, 2]
    } else if (inherits(glb_fin_mdl, "glm")) {
        stop("not implemented yet")
        glb_newent_df[, paste0(glb_rsp_var_out, ".proba")] <- 
            predict(glb_fin_mdl, newdata=glb_newent_df, type="response")
    } else stop("not implemented yet")   

    glb_newent_df[, glb_rsp_var_out] <- 
        proba_to_outcome(glb_newent_df[, paste0(glb_rsp_var_out, ".proba")])

    # Compute dummy model predictions
    glb_newent_df[, paste0(glb_rsp_var, ".predictdmy.proba")] <- 
        predict(glb_dmy_mdl, newdata=glb_newent_df, type="prob")[, 2]
    glb_newent_df[, paste0(glb_rsp_var, ".predictdmy")] <- 
        proba_to_outcome(glb_newent_df[, paste0(glb_rsp_var, ".predictdmy.proba")])
}

if (glb_is_classification & !glb_is_binomial) {
    # Baseline predictions: raw baseline column, optionally mapped into the
    # response's factor space
    if (!is.null(glb_bsl_mdl_var)) {
        bsl_raw <- glb_newent_df[, glb_bsl_mdl_var]
        glb_newent_df[, paste0(glb_rsp_var, ".predictbsl")] <- 
            if (is.null(glb_map_rsp_raw_to_var)) bsl_raw else 
                glb_map_rsp_raw_to_var(bsl_raw)
    }    

    # Most-frequent-outcome predictions (mode of the training response)
    freq_tbl <- sort(table(glb_entity_df[, glb_rsp_var]), decreasing=TRUE)
    glb_newent_df[, paste0(glb_rsp_var, ".predictmfo")] <- 
        as.factor(names(freq_tbl)[1])

    # dummy (shd differ from most frequent outcome - shd be from runif),
    # selected_no_loss_matrix, selected_with_loss_matrix and
    # selected_with_t_loss_matrix predictions -- all via type="raw"
    raw_mdls <- list(dmy=glb_dmy_mdl, nlm=glb_sel_nlm_mdl, 
                     wlm=glb_sel_wlm_mdl, sel=glb_sel_mdl)
    for (sfx in names(raw_mdls))
        glb_newent_df[, paste0(glb_rsp_var, ".predict", sfx)] <- 
            predict(raw_mdls[[sfx]], newdata=glb_newent_df, type="raw")

    # Final model predictions go into the standard output column
    glb_newent_df[, glb_rsp_var_out] <- 
        predict(glb_fin_mdl, newdata=glb_newent_df, type="raw")
}
    
myprint_df(glb_newent_df[, c(glb_id_vars, glb_rsp_var, glb_rsp_var_out)])
##     bucket2009.fctr bucket2009.fctr.prediction
## 6                B1                         B1
## 26               B1                         B1
## 95               B1                         B1
## 121              B1                         B1
## 197              B1                         B1
## 286              B1                         B1
##        bucket2009.fctr bucket2009.fctr.prediction
## 141700              B1                         B1
## 240147              B1                         B1
## 254198              B1                         B2
## 272537              B1                         B1
## 429148              B3                         B1
## 451493              B4                         B2
##        bucket2009.fctr bucket2009.fctr.prediction
## 457490              B5                         B2
## 457576              B5                         B2
## 457604              B5                         B2
## 457705              B5                         B2
## 457824              B5                         B2
## 457912              B5                         B2
if (glb_is_regression) {
    # SSE is computed once and reused for RMSE
    sse <- sum((glb_newent_df[, glb_rsp_var_out] - 
                glb_newent_df[, glb_rsp_var]) ^ 2)
    print(sprintf("Total SSE: %0.4f", sse))
    print(sprintf("RMSE: %0.4f", sqrt(sse / nrow(glb_newent_df))))
    print(myplot_scatter(glb_newent_df, glb_rsp_var, glb_rsp_var_out, 
                         smooth=TRUE))

    # Show the observations with the largest absolute prediction errors
    err_col <- paste0(glb_rsp_var_out, ".err")
    glb_newent_df[, err_col] <- 
        abs(glb_newent_df[, glb_rsp_var_out] - glb_newent_df[, glb_rsp_var])
    print(head(orderBy(reformulate(c("-", err_col)), glb_newent_df)))

#     glb_newent_df[, "<Output Pred variable>"] <- func(glb_newent_df[, glb_pred_var_name])                         
}                         

if (glb_is_classification & glb_is_binomial) {
    # Area under the ROC curve for the final model on new data
    proba_col <- paste0(glb_rsp_var_out, ".proba")
    ROCRpred <- prediction(glb_newent_df[, proba_col],
                           glb_newent_df[, glb_rsp_var])
    print(sprintf("auc=%0.4f", auc <- as.numeric(performance(ROCRpred, "auc")@y.values)))   
    
    print(sprintf("probability threshold=%0.4f", glb_clf_proba_threshold))
    newent_conf_df <- mycreate_xtab(glb_newent_df, 
                                    c(glb_rsp_var, glb_rsp_var_out))
    print(newent_conf_df)
    print(sprintf("f.score.sel=%0.4f", 
        mycompute_classifier_f.score(mdl=glb_fin_mdl, obs_df=glb_newent_df, 
                                     proba_threshold=glb_clf_proba_threshold, 
                                     rsp_var=glb_rsp_var, 
                                     rsp_var_out=glb_rsp_var_out)))

    # Name the confusion-matrix cells (rows: observed, cols: predicted;
    # column 1 is the response label itself)
    tn <- newent_conf_df[1, 2]
    fp <- newent_conf_df[1, 3]
    fn <- newent_conf_df[2, 2]
    tp <- newent_conf_df[2, 3]
    print(sprintf("sensitivity=%0.4f", tp / (tp + fn)))
    print(sprintf("specificity=%0.4f", tn / (tn + fp)))
    print(sprintf("accuracy=%0.4f", (tn + tp) / (tn + tp + fp + fn)))
    
    # Dummy-classifier comparison
    print(mycreate_xtab(glb_newent_df, c(glb_rsp_var, paste0(glb_rsp_var, ".predictdmy"))))
    print(sprintf("f.score.dmy=%0.4f", 
        mycompute_classifier_f.score(mdl=glb_dmy_mdl, obs_df=glb_newent_df, 
                                     proba_threshold=glb_clf_proba_threshold, 
                                     rsp_var=glb_rsp_var, 
                                     rsp_var_out=paste0(glb_rsp_var, ".predictdmy"))))
}    

if (glb_is_classification & !glb_is_binomial) {
    # Per-model confusion matrix -> accuracy & expected loss on new data.
    # mdl_type is one of the ".predict<type>" suffixes, or "fin" for the
    # final-model output column.
    mycompute_prediction_quality <- function(mdl_type) {    
        print(sprintf("prediction model type: %s", mdl_type))
        pred_col <- if (mdl_type == "fin") glb_rsp_var_out else 
            paste0(glb_rsp_var, ".predict", mdl_type)
        conf_df <- mycreate_xtab(glb_newent_df, c(glb_rsp_var, pred_col))
        print(conf_df)
        conf_df[is.na(conf_df)] <- 0    
        # Drop the label column and zero-pad missing predicted-outcome
        # columns so the matrix is square (diag & loss-matrix product)
        conf_mtrx <- as.matrix(conf_df[, -1])
        conf_mtrx <- cbind(conf_mtrx, 
            matrix(rep(0, 
                (nrow(conf_mtrx) - ncol(conf_mtrx)) * nrow(conf_mtrx)), 
                    byrow=TRUE, nrow=nrow(conf_mtrx)))
        data.frame(mdl_type=mdl_type, 
            accuracy=sum(diag(conf_mtrx)) * 1.0 / sum(conf_mtrx),    
            loss.error=sum(conf_mtrx * glb_loss_mtrx) / nrow(glb_newent_df))
    }

    # Build the quality table functionally instead of growing it in a loop
    predict_qlty_df <- do.call(rbind, 
        lapply(c("bsl", "mfo", "dmy", "nlm", "wlm", "sel", "fin"), 
               mycompute_prediction_quality))
    print(predict_qlty_df)
    
    # Plot inverse loss so "up and to the right" is better on both axes
    predict_qlty_df$inv.loss.error <- 1.0 / predict_qlty_df$loss.error
    print(myplot_scatter(predict_qlty_df, "accuracy", "inv.loss.error", 
                         colorcol_name="mdl_type"))
}    
## [1] "prediction model type: bsl"
##   bucket2009.fctr bucket2009.fctr.predictbsl.B1
## 1              B1                          2993
## 2              B2                           430
## 3              B3                           198
## 4              B4                            78
## 5              B5                            10
##   bucket2009.fctr.predictbsl.B2 bucket2009.fctr.predictbsl.B3
## 1                           216                           102
## 2                           292                           124
## 3                           126                            68
## 4                            51                            39
## 5                             5                             5
##   bucket2009.fctr.predictbsl.B4 bucket2009.fctr.predictbsl.B5
## 1                            39                             6
## 2                            88                            17
## 3                            48                             7
## 4                            39                            10
## 5                             6                             3
## [1] "prediction model type: mfo"
##   bucket2009.fctr bucket2009.fctr.predictmfo.B1
## 1              B1                          3356
## 2              B2                           951
## 3              B3                           447
## 4              B4                           217
## 5              B5                            29
## [1] "prediction model type: dmy"
##   bucket2009.fctr bucket2009.fctr.predictdmy.B1
## 1              B1                          3356
## 2              B2                           951
## 3              B3                           447
## 4              B4                           217
## 5              B5                            29
## [1] "prediction model type: nlm"
##   bucket2009.fctr bucket2009.fctr.predictnlm.B1
## 1              B1                          3011
## 2              B2                           455
## 3              B3                           195
## 4              B4                            80
## 5              B5                            11
##   bucket2009.fctr.predictnlm.B2
## 1                           345
## 2                           496
## 3                           252
## 4                           137
## 5                            18
## [1] "prediction model type: wlm"
##   bucket2009.fctr bucket2009.fctr.predictwlm.B1
## 1              B1                          3011
## 2              B2                           455
## 3              B3                           195
## 4              B4                            80
## 5              B5                            11
##   bucket2009.fctr.predictwlm.B2
## 1                           345
## 2                           496
## 3                           252
## 4                           137
## 5                            18
## [1] "prediction model type: sel"
##   bucket2009.fctr bucket2009.fctr.predictsel.B1
## 1              B1                          3076
## 2              B2                           524
## 3              B3                           230
## 4              B4                            89
## 5              B5                            12
##   bucket2009.fctr.predictsel.B2
## 1                           280
## 2                           427
## 3                           217
## 4                           128
## 5                            17
## [1] "prediction model type: fin"
##   bucket2009.fctr bucket2009.fctr.prediction.B1
## 1              B1                          3011
## 2              B2                           455
## 3              B3                           195
## 4              B4                            80
## 5              B5                            11
##   bucket2009.fctr.prediction.B2
## 1                           345
## 2                           496
## 3                           252
## 4                           137
## 5                            18
##   mdl_type accuracy loss.error
## 1      bsl   0.6790     0.7560
## 2      mfo   0.6712     1.0448
## 3      dmy   0.6712     1.0448
## 4      nlm   0.7014     0.7526
## 5      wlm   0.7014     0.7526
## 6      sel   0.7006     0.7852
## 7      fin   0.7014     0.7526

# Re-generate the diagnostic plots, this time for the new-data predictions
glb_analytics_diag_plots(obs_df=glb_newent_df)

##        age alzheimers arthritis cancer copd depression diabetes
## 6       68          0         0      0    0          0        0
## 124097  76          0         0      0    0          0        0
## 129688  73          0         1      0    0          0        1
## 159234  90          0         0      0    1          0        1
## 169528  74          0         0      0    0          0        1
## 174385  82          0         1      1    0          0        1
## 182164  67          0         0      0    0          0        0
## 183017  67          0         1      0    0          0        0
## 183925  82          1         0      0    1          1        1
## 191662  80          0         0      0    0          0        1
## 225906  67          1         1      0    1          1        1
## 276769  85          0         0      1    1          1        0
## 288549  89          1         0      1    0          0        1
## 307826  53          0         0      0    0          0        0
## 309958  97          0         0      0    0          0        0
## 312065  68          0         0      0    0          0        0
## 315618  53          0         0      0    0          0        0
## 322157  45          0         0      0    0          0        0
## 335790  72          0         0      0    0          0        0
## 380565  73          1         1      1    1          0        1
##        heart.failure ihd kidney osteoporosis stroke reimbursement2008
## 6                  0   0      0            0      0                 0
## 124097             1   0      1            0      0              1960
## 129688             0   1      1            0      0              2920
## 159234             1   1      1            0      0             73860
## 169528             0   1      1            0      0              2350
## 174385             1   1      1            1      0             73650
## 182164             0   1      1            0      0              1690
## 183017             0   1      1            0      0              2480
## 183925             1   1      1            0      0             84040
## 191662             1   1      1            1      0              2700
## 225906             1   1      1            1      1             98110
## 276769             1   1      1            0      0             86040
## 288549             1   1      1            0      0             64380
## 307826             0   0      0            0      0                 0
## 309958             0   0      0            0      0                 0
## 312065             0   0      0            0      0                 0
## 315618             0   0      0            0      0                 0
## 322157             0   0      0            0      0                 0
## 335790             0   0      0            0      0                 0
## 380565             1   1      1            0      1            173320
##        bucket2008 reimbursement2009 bucket2009     .rnorm bucket2009.fctr
## 6               1                 0          1 -0.2112782              B1
## 124097          1               200          1  1.1792599              B1
## 129688          1               260          1  0.4691106              B1
## 159234          5               630          1  0.3867920              B1
## 169528          1               760          1  0.7510352              B1
## 174385          5               820          1 -0.9283519              B1
## 182164          1               920          1  0.4947145              B1
## 183017          1               930          1 -0.3375198              B1
## 183925          5               940          1  0.1999707              B1
## 191662          1              1040          1  1.3930878              B1
## 225906          5              1490          1 -1.1052827              B1
## 276769          5              2300          1  0.7084125              B1
## 288549          5              2540          1  1.5000372              B1
## 307826          1              3010          2  0.4305946              B2
## 309958          1              3070          2  1.6863455              B2
## 312065          1              3130          2 -0.6355543              B2
## 315618          1              3230          2 -0.7178728              B2
## 322157          1              3430          2  0.3348709              B2
## 335790          1              3910          2 -0.0957258              B2
## 380565          5              6470          2 -0.7323969              B2
##        bucket2009.fctr.predictbsl bucket2009.fctr.predictmfo
## 6                              B1                         B1
## 124097                         B1                         B1
## 129688                         B1                         B1
## 159234                         B5                         B1
## 169528                         B1                         B1
## 174385                         B5                         B1
## 182164                         B1                         B1
## 183017                         B1                         B1
## 183925                         B5                         B1
## 191662                         B1                         B1
## 225906                         B5                         B1
## 276769                         B5                         B1
## 288549                         B5                         B1
## 307826                         B1                         B1
## 309958                         B1                         B1
## 312065                         B1                         B1
## 315618                         B1                         B1
## 322157                         B1                         B1
## 335790                         B1                         B1
## 380565                         B5                         B1
##        bucket2009.fctr.predictdmy bucket2009.fctr.predictnlm
## 6                              B1                         B1
## 124097                         B1                         B2
## 129688                         B1                         B2
## 159234                         B1                         B2
## 169528                         B1                         B2
## 174385                         B1                         B2
## 182164                         B1                         B2
## 183017                         B1                         B2
## 183925                         B1                         B2
## 191662                         B1                         B2
## 225906                         B1                         B2
## 276769                         B1                         B2
## 288549                         B1                         B2
## 307826                         B1                         B1
## 309958                         B1                         B1
## 312065                         B1                         B1
## 315618                         B1                         B1
## 322157                         B1                         B1
## 335790                         B1                         B1
## 380565                         B1                         B2
##        bucket2009.fctr.predictwlm bucket2009.fctr.predictsel
## 6                              B1                         B1
## 124097                         B2                         B1
## 129688                         B2                         B1
## 159234                         B2                         B2
## 169528                         B2                         B1
## 174385                         B2                         B2
## 182164                         B2                         B1
## 183017                         B2                         B1
## 183925                         B2                         B2
## 191662                         B2                         B1
## 225906                         B2                         B2
## 276769                         B2                         B2
## 288549                         B2                         B2
## 307826                         B1                         B1
## 309958                         B1                         B1
## 312065                         B1                         B1
## 315618                         B1                         B1
## 322157                         B1                         B1
## 335790                         B1                         B1
## 380565                         B2                         B2
##        bucket2009.fctr.prediction bucket2009.fctr.prediction.accurate
## 6                              B1                                TRUE
## 124097                         B2                               FALSE
## 129688                         B2                               FALSE
## 159234                         B2                               FALSE
## 169528                         B2                               FALSE
## 174385                         B2                               FALSE
## 182164                         B2                               FALSE
## 183017                         B2                               FALSE
## 183925                         B2                               FALSE
## 191662                         B2                               FALSE
## 225906                         B2                               FALSE
## 276769                         B2                               FALSE
## 288549                         B2                               FALSE
## 307826                         B1                               FALSE
## 309958                         B1                               FALSE
## 312065                         B1                               FALSE
## 315618                         B1                               FALSE
## 322157                         B1                               FALSE
## 335790                         B1                               FALSE
## 380565                         B2                                TRUE
##         .label
## 6           .6
## 124097 .124097
## 129688 .129688
## 159234 .159234
## 169528 .169528
## 174385 .174385
## 182164 .182164
## 183017 .183017
## 183925 .183925
## 191662 .191662
## 225906 .225906
## 276769 .276769
## 288549 .288549
## 307826 .307826
## 309958 .309958
## 312065 .312065
## 315618 .315618
## 322157 .322157
## 335790 .335790
## 380565 .380565

# Register the new-data prediction object and capture the final replay state
# (tmp_replay_lst retains the replayed Petri net for optional plotting below)
glb_analytics_avl_objs <- c(glb_analytics_avl_objs, "data.new.prediction")
tmp_replay_lst <- replay.petrisim(pn=glb_analytics_pn, 
                                  replay.trans=glb_analytics_avl_objs, 
                                  flip_coord=TRUE)
## time trans    "bgn " "fit.data.training.all " "predict.data.new " "end " 
## 0.0000   multiple enabled transitions:  data.training.all data.new model.selected    firing:  data.training.all 
## 1.0000    1   2 1 0 0 
## 1.0000   multiple enabled transitions:  data.training.all data.new model.selected model.final data.training.all.prediction   firing:  data.new 
## 2.0000    2   1 1 1 0 
## 2.0000   multiple enabled transitions:  data.training.all data.new model.selected model.final data.training.all.prediction data.new.prediction   firing:  model.selected 
## 3.0000    3   0 2 1 0 
## 3.0000   multiple enabled transitions:  model.final data.training.all.prediction data.new.prediction     firing:  data.training.all.prediction 
## 4.0000    5   0 1 1 1 
## 4.0000   multiple enabled transitions:  model.final data.training.all.prediction data.new.prediction     firing:  model.final 
## 5.0000    4   0 0 2 1 
## 6.0000    6   0 0 1 2

#print(ggplot.petrinet(tmp_replay_lst[["pn"]]) + coord_flip())

Null Hypothesis (\(\sf{H_{0}}\)): mpg is not impacted by am_fctr.
The variances of mpg across the am_fctr groups appear to be unequal, so an unequal-variance (Welch) t-test is used (var.equal=FALSE).

# print(t.test(subset(cars_df, am_fctr == "automatic")$mpg, 
#              subset(cars_df, am_fctr == "manual")$mpg, 
#              var.equal=FALSE)$conf)

We reject the null hypothesis, i.e., we have evidence to conclude that am_fctr impacts mpg (95% confidence): manual transmission yields better miles per gallon than automatic transmission.

##                   chunk_label chunk_step_major chunk_step_minor elapsed
## 10      fit.data.training.all                6                0  35.325
## 12           predict.data.new                7                0  58.094
## 2                cleanse_data                2                0   6.270
## 11      fit.data.training.all                6                1  40.509
## 6            extract_features                3                0  10.942
## 4         manage_missing_data                2                2   7.808
## 9                  run.models                5                0  13.903
## 7             select_features                4                0  12.299
## 5          encode_retype_data                2                3   8.228
## 8  remove_correlated_features                4                1  12.506
## 3       inspectORexplore.data                2                1   6.301
## 1                 import_data                1                0   0.002
##    elapsed_diff
## 10       21.422
## 12       17.585
## 2         6.268
## 11        5.184
## 6         2.714
## 4         1.507
## 9         1.397
## 7         1.357
## 5         0.420
## 8         0.207
## 3         0.031
## 1         0.000
## [1] "Total Elapsed Time: 58.094 secs"

## R version 3.1.3 (2015-03-09)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.2 (Yosemite)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] tcltk     grid      stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] rpart.plot_1.5.2 rpart_4.1-9      caret_6.0-41     lattice_0.20-31 
##  [5] ROCR_1.0-7       gplots_2.16.0    reshape2_1.4.1   sqldf_0.4-10    
##  [9] RSQLite_1.0.0    DBI_0.3.1        gsubfn_0.6-6     proto_0.3-10    
## [13] plyr_1.8.1       caTools_1.17.1   doBy_4.5-13      survival_2.38-1 
## [17] ggplot2_1.0.1   
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6        BradleyTerry2_1.0-6 brglm_0.5-9        
##  [4] car_2.0-25          chron_2.3-45        class_7.3-12       
##  [7] codetools_0.2-11    colorspace_1.2-6    compiler_3.1.3     
## [10] digest_0.6.8        e1071_1.6-4         evaluate_0.5.5     
## [13] foreach_1.4.2       formatR_1.1         gdata_2.13.3       
## [16] gtable_0.1.2        gtools_3.4.1        htmltools_0.2.6    
## [19] iterators_1.0.7     KernSmooth_2.23-14  knitr_1.9          
## [22] labeling_0.3        lme4_1.1-7          MASS_7.3-40        
## [25] Matrix_1.2-0        mgcv_1.8-6          minqa_1.2.4        
## [28] munsell_0.4.2       nlme_3.1-120        nloptr_1.0.4       
## [31] nnet_7.3-9          parallel_3.1.3      pbkrtest_0.4-2     
## [34] quantreg_5.11       Rcpp_0.11.5         rmarkdown_0.5.1    
## [37] scales_0.2.4        SparseM_1.6         splines_3.1.3      
## [40] stringr_0.6.2       tools_3.1.3         yaml_2.1.13